This analysis was used in the short-read (Illumina) taxonomic based metagenomic analysis of noma vs healthy human saliva samples in R posted as a preprint on bioRxiv:
Shotgun metagenomic analysis of the oral microbiomes of noma patients reveals a novel disease-associated organism and potential avenues for disease prevention
Olaleye, M., O’Ferrall A.M., Goodman, R.N. et al.Â
Install all necessary packages into R.
# Make sure these are all installed as packages first
# Load necessary libraries
library(readxl)
library(pheatmap)
library(dplyr)
library(tidyr)
library(tibble)
library(RColorBrewer)
library(miaViz)
library(scater)
library(mia)
library(TreeSummarizedExperiment)
library(here)
library(readr)
library(phyloseq)
library(DESeq2)
library(ggsignif)
library(ggrepel)
library(gridExtra)
library(vegan)
library(randomForest)
library(e1071)
library(pROC)
library(ROCR)
library(caret)
We will be importing MetaPhlan style Bracken data
First we’ll write a function to import MetaPhlan style bracken data
file_path = "data/noma_HMP_saliva_bracken_MetaPhlan_style_report_bacteria_noma_v_healthy_saliva_A1_A19_corrGB.txt"
sample_data_path = "data/Samples_healthy_v_noma_saliva_A1_A19_corrGB.csv"
# Import data
tse_metaphlan_noma = loadFromMetaphlan(file_path)
# Defining the TSE for the rest of the script
tse_metaphlan = tse_metaphlan_noma
patient_metadata = read_excel("data/micro_study_metadata.xlsx")
sample_to_patient = read_excel("data/sample_to_patient_A1_A40.xlsx")
metadata = dplyr::inner_join(patient_metadata, sample_to_patient, by = "respondent_id")
metadata_2 = metadata %>% filter(sample_name %in% colnames(tse_metaphlan_noma))
coldata = data.frame(sample_name = colnames(tse_metaphlan_noma))
metadata_3 = dplyr::left_join(coldata, metadata_2, by = "sample_name")
# Create a DataFrame with this information
metadata_df = DataFrame(metadata_3)
rownames(metadata_df) = metadata_3$sample_name
t_metadata_df = t(metadata_df)
ncol(t_metadata_df)
## [1] 37
# Add this DataFrame as colData to your TreeSummarizedExperiment object
colData(tse_metaphlan_noma) = metadata_df
tse_metaphlan = tse_metaphlan_noma
# Assume your TreeSummarizedExperiment object is called `tse`
# Get the current sample names (column names)
sample_names <- colnames(tse_metaphlan)
sample_names
## [1] "A10" "A11" "A13" "A14" "A16"
## [6] "A17" "A18" "A19" "A1" "A2"
## [11] "A3" "A4" "A5" "A6" "A7"
## [16] "A8" "A9" "DRR214959" "DRR214960" "DRR214961"
## [21] "DRR214962" "DRR241310" "SRR5892208" "SRR5892209" "SRR5892210"
## [26] "SRR5892211" "SRR5892212" "SRR5892213" "SRR5892214" "SRR5892215"
## [31] "SRR5892216" "SRR5892217" "SRS013942" "SRS014468" "SRS014692"
## [36] "SRS015055" "SRS019120"
# Create a vector indicating the condition (diseased or healthy)
# sample_1 to sample_17 are diseased, and sample_18 to sample_37 are healthy
condition <- c(rep("diseased", 17), rep("healthy", 20))
# Create a DataFrame with this information
sample_metadata_disease <- DataFrame(condition = condition)
rownames(sample_metadata_disease) = sample_names
sample_metadata_disease
## DataFrame with 37 rows and 1 column
## condition
## <character>
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## ... ...
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
# Add this DataFrame as colData to your TreeSummarizedExperiment object
colData(tse_metaphlan) <- sample_metadata_disease
# Check that colData was added successfully
head(as.data.frame(colData(tse_metaphlan)), n = 37)
## condition
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## A17 diseased
## A18 diseased
## A19 diseased
## A1 diseased
## A2 diseased
## A3 diseased
## A4 diseased
## A5 diseased
## A6 diseased
## A7 diseased
## A8 diseased
## A9 diseased
## DRR214959 healthy
## DRR214960 healthy
## DRR214961 healthy
## DRR214962 healthy
## DRR241310 healthy
## SRR5892208 healthy
## SRR5892209 healthy
## SRR5892210 healthy
## SRR5892211 healthy
## SRR5892212 healthy
## SRR5892213 healthy
## SRR5892214 healthy
## SRR5892215 healthy
## SRR5892216 healthy
## SRR5892217 healthy
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
You can inspect the data with these functions.
The output is not printed here as it would be too large.
(count <- assays(tse_metaphlan)[[1]])
head(rowData(tse_metaphlan))
head(colData(tse_metaphlan))
head(metadata(tse_metaphlan))
phyloseq_metaphlan = makePhyloseqFromTreeSE(tse_metaphlan)
phyloseq_obj = makePhyloseqFromTreeSE(tse_metaphlan)
Non-parametric methods were used to determine the difference between samples based on the categorical variable of disease status. Non-parametric tests are used for metagenomic data due to the non-normal distribution of the data. We used the Bray-Curtis dissimilarity matrix for all these tests.
# See above "Converting TSE to other common data formats e.g. Phyloseq"
# Use makePhyloseqFromTreeSE from Miaverse
tse_metaphlan_noma = tse_metaphlan
# make an assay for abundance
tse_metaphlan_noma <- transformAssay(tse_metaphlan_noma, assay.type="counts", method="relabundance")
taxonomyRanks(tse_metaphlan_noma)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
colData(tse_metaphlan_noma)
## DataFrame with 37 rows and 1 column
## condition
## <character>
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## ... ...
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
# make an altExp and matrix for order
tse_metaphlan_noma_genus = altExp(tse_metaphlan_noma, "Genus")
# Add metadata to colData
colData(tse_metaphlan_noma_genus)
## DataFrame with 37 rows and 0 columns
colData(tse_metaphlan_noma_genus) <- sample_metadata_disease
metadata_df = colData(tse_metaphlan_noma_genus)
metadata_noma_genus = as.data.frame(colData(tse_metaphlan_noma_genus))
metadata_noma_genus
## condition
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## A17 diseased
## A18 diseased
## A19 diseased
## A1 diseased
## A2 diseased
## A3 diseased
## A4 diseased
## A5 diseased
## A6 diseased
## A7 diseased
## A8 diseased
## A9 diseased
## DRR214959 healthy
## DRR214960 healthy
## DRR214961 healthy
## DRR214962 healthy
## DRR241310 healthy
## SRR5892208 healthy
## SRR5892209 healthy
## SRR5892210 healthy
## SRR5892211 healthy
## SRR5892212 healthy
## SRR5892213 healthy
## SRR5892214 healthy
## SRR5892215 healthy
## SRR5892216 healthy
## SRR5892217 healthy
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
# genus
phyloseq_noma = makePhyloseqFromTreeSE(tse_metaphlan_noma_genus)
phyloseq_noma_esd = transform_sample_counts(phyloseq_noma, function(x) 1E6 * x/sum(x))
ntaxa(phyloseq_noma_esd)
## [1] 1757
nsamples(phyloseq_noma_esd)
## [1] 37
set.seed(123456)
# Calculate bray curtis distance matrix on main variables
noma.bray = phyloseq::distance(phyloseq_noma_esd, method = "bray")
sample.noma.df <- data.frame(sample_data(phyloseq_noma_esd))
permanova_all = vegan::adonis2(noma.bray ~ condition , data = sample.noma.df)
permanova_all
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = noma.bray ~ condition, data = sample.noma.df)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 2.5163 0.40556 23.879 0.001 ***
## Residual 35 3.6881 0.59444
## Total 36 6.2044 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Next we will test the beta dispersion
# All together now
vegan::adonis2(noma.bray ~ condition, data = sample.noma.df)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## vegan::adonis2(formula = noma.bray ~ condition, data = sample.noma.df)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 2.5163 0.40556 23.879 0.001 ***
## Residual 35 3.6881 0.59444
## Total 36 6.2044 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(noma.bray, sample.noma.df$condition)
permutest(beta)
##
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
##
## Response: Distances
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 1 0.07827 0.078269 3.8016 999 0.056 .
## Residuals 35 0.72060 0.020589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we don't want this to be significant
condition_group = get_variable(phyloseq_noma_esd, "condition")
set.seed (123456)
anosim(distance(phyloseq_noma_esd, "bray"), condition_group)
##
## Call:
## anosim(x = distance(phyloseq_noma_esd, "bray"), grouping = condition_group)
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.734
## Significance: 0.001
##
## Permutation: free
## Number of permutations: 999
condition_ano = anosim(distance(phyloseq_noma_esd, "bray"), condition_group)
condition_ano
##
## Call:
## anosim(x = distance(phyloseq_noma_esd, "bray"), grouping = condition_group)
## Dissimilarity: bray
##
## ANOSIM statistic R: 0.734
## Significance: 0.001
##
## Permutation: free
## Number of permutations: 999
#condition
noma.bray <- phyloseq::distance(phyloseq_noma_esd, method = "bray") # Calculate bray curtis distance matrix
condition_group = get_variable(phyloseq_noma_esd, "condition") # Make condition Grouping
# Run MRPP
set.seed(123456)
vegan::mrpp(noma.bray, condition_group, permutations = 999,
weight.type = 1, strata = NULL, parallel = getOption("mc.cores"))
##
## Call:
## vegan::mrpp(dat = noma.bray, grouping = condition_group, permutations = 999, weight.type = 1, strata = NULL, parallel = getOption("mc.cores"))
##
## Dissimilarity index: bray
## Weights for groups: n
##
## Class means and counts:
##
## diseased healthy
## delta 0.4931 0.3533
## n 17 20
##
## Chance corrected within-group agreement A: 0.2364
## Based on observed delta 0.4176 and expected delta 0.5468
##
## Significance of delta: 0.001
## Permutation: free
## Number of permutations: 999
# create a
tse_metaphlan_genus = altExp(tse_metaphlan, "Genus")
# Extract the counts and taxonomic table
counts <- assay(tse_metaphlan_genus, "counts")
tax_table <- rowData(tse_metaphlan_genus)$Genus # Replace "Genus" with your taxonomic level of interest
sample_data <- colData(tse_metaphlan_genus)
groups = as.data.frame(sample_data)
# Aggregate counts by Genus
aggregated_counts <- rowsum(counts, tax_table)
# Create a new aggregated TreeSummarizedExperiment object
tse_aggregated <- TreeSummarizedExperiment(assays = list(counts = aggregated_counts),
colData = sample_data)
# Calculate relative abundances
relative_abundances <- sweep(assay(tse_aggregated, "counts"), 2, colSums(assay(tse_aggregated, "counts")), FUN = "/") * 100
# Convert to a data frame and group by Treatment
relative_df <- as.data.frame(t(relative_abundances))
set.seed (123456)
# Define the vector of genera names (without the "g__" prefix)
genera <- c("Prevotella", "Treponema", "Neisseria", "Bacteroides",
"Filifactor", "Porphyromonas", "Fusobacterium", "Escherichia",
"Selenomonas", "Aggregatibacter", "Capnocytophaga")
# Initialize an empty data frame to store the results
permanova_taxa_results <- data.frame(Genus = character(), pvalue = numeric(), stringsAsFactors = FALSE)
# Loop over each genus
for (genus in genera) {
set.seed (123456)
# Subset the data for the genus; adjust column selection as needed
subset_data <- relative_df %>% select(paste0("g__", genus))
# Calculate the Bray-Curtis distance
bray_dist <- vegdist(subset_data, method = "bray")
# Run PERMANOVA using adonis2
adonis_result <- adonis2(bray_dist ~ condition, data = groups)
# Extract the p-value for the sample_type factor (usually in the first row)
pval <- adonis_result$`Pr(>F)`[1]
# Append the result to the results data frame
permanova_taxa_results <- rbind(permanova_taxa_results, data.frame(Genus = genus, pvalue = pval))
}
print(permanova_taxa_results)
## Genus pvalue
## 1 Prevotella 0.014
## 2 Treponema 0.001
## 3 Neisseria 0.035
## 4 Bacteroides 0.001
## 5 Filifactor 0.001
## 6 Porphyromonas 0.001
## 7 Fusobacterium 0.002
## 8 Escherichia 0.002
## 9 Selenomonas 0.003
## 10 Aggregatibacter 0.002
## 11 Capnocytophaga 0.016
The top 20 most abundant genera were selected from across the entire dataset and visualised with the plotAbundance function of miaViz.
# Check taxonomy ranks
taxonomyRanks(tse_metaphlan)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
# make an assay for abundance
tse_metaphlan <- transformAssay(tse_metaphlan, assay.type="counts", method="relabundance")
# make an altExp and matrix for Genus
altExp(tse_metaphlan,"Genus") <- agglomerateByRank(tse_metaphlan,"Genus")
# make a dataframe of relative abundance
relabundance_df_Genus <- as.data.frame(assay(altExp(tse_metaphlan, "Genus"), "relabundance"))
# make a matric of relative abundance
relabundance_matrix_Genus <- assay(altExp(tse_metaphlan, "Genus"), "relabundance")
# calculate the total relative abundance of each Genus (row sums)
total_relabundance_Genus <- rowSums(relabundance_matrix_Genus)
# Identify the top 20 top Genuss
top_Genus <- names(sort(total_relabundance_Genus, decreasing = TRUE)[1:20])
# Delete everything from start to Genus
top_Genus = sub(".*_g__","",top_Genus)
# Add Genus back in
top_Genus = paste0(paste(rep("g__", length(top_Genus)), top_Genus))
# Delete the space introduced by this
top_Genus = sub(" ","",top_Genus)
top_Genus
## [1] "g__Prevotella" "g__Streptococcus" "g__Neisseria"
## [4] "g__Haemophilus" "g__Veillonella" "g__Rothia"
## [7] "g__Fusobacterium" "g__Treponema" "g__Schaalia"
## [10] "g__Escherichia" "g__Aggregatibacter" "g__Selenomonas"
## [13] "g__Leptotrichia" "g__Actinomyces" "g__Campylobacter"
## [16] "g__Capnocytophaga" "g__Porphyromonas" "g__Bacteroides"
## [19] "g__Gemella" "g__Filifactor"
# make a new tse_metaphlan where the top 14 Genuss are recognised, while others are "other"
tse_metaphlan_top_20_Genus <- tse_metaphlan
rowData(tse_metaphlan_top_20_Genus)$Genus <- ifelse(rowData(tse_metaphlan_top_20_Genus)$Genus %in% top_Genus, rowData(tse_metaphlan_top_20_Genus)$Genus, "-other")
genus_colors <- c(
"-other" = "#E41A1C",
"g__Actinomyces" = "#377EB8",
"g__Aggregatibacter" = "#4DAF4A",
"g__Bacteroides" = "#984EA3",
"g__Campylobacter" = "#FF7F00",
"g__Capnocytophaga" = "#FFFF33",
"g__Dialister" = "#E7298A",
"g__Escherichia" = "#A65628",
"g__Filifactor" = "#F781BF",
"g__Fusobacterium" = "#999999",
"g_Gemella" = "#1B9E77",
"g__Haemophilus" = "#D95F02",
"g__Leptotrichia" = "#7570B3",
"g__Neisseria" = "#E7298A",
"g__Porphyromonas" = "#66A61E",
"g__Prevotella" = "#E6AB02",
"g__Rothia" = "#66C2A5",
"g__Schaalia" = "#FC8D62",
"g__Selenomonas" = "#8DA0CB",
"g__Streptococcus" = "#E78AC3",
"g__Tannerella" = "#E41A1C",
"g__Treponema" = "#A6D854",
"g__Veillonella" = "#FFD92F"
)
Genus_plot <- plotAbundance(tse_metaphlan_top_20_Genus,
assay.type = "relabundance",
rank = "Genus",
add_x_text = TRUE) +
theme(plot.margin = ggplot2::margin(t = 30, r = 10, b = 10, l = 10))
Genus_plot_cols = Genus_plot + scale_fill_manual(values=genus_colors)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
Genus_plot_cols
# Get numbers in table
# Total
top_Genus_numbers_basic = sort(total_relabundance_Genus, decreasing = TRUE)
top_Genus_numbers_df = as.data.frame(top_Genus_numbers_basic)
top_Genus_numbers = as.tibble(top_Genus_numbers_df)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
rownames(top_Genus_numbers_df) <- gsub(".*(g__*)", "\\1", rownames(top_Genus_numbers_df))
top_Genus_pc = top_Genus_numbers %>%
mutate(top_Genus_percentage = (top_Genus_numbers_basic/sum(top_Genus_numbers$top_Genus_numbers_basic)) * 100) %>%
mutate(top_Genus = rownames(top_Genus_numbers_df))
sum(top_Genus_pc$top_Genus_percentage)
## [1] 100
top_Genus_pc$top_20_Genus = ifelse(top_Genus_pc$top_Genus %in% top_Genus, top_Genus_pc$top_Genus, "-other")
top_Genus_pc$top_Genus_above_1 = ifelse(top_Genus_pc$top_Genus_percentage > 1, top_Genus_pc$top_Genus, "-other")
top_Genus_above_1 = unique(top_Genus_pc$top_Genus_above_1)
top_Genus_pc$top_Genus_above_0.1 = ifelse(top_Genus_pc$top_Genus_percentage > 0.1, top_Genus_pc$top_Genus, "-other")
top_Genus_above_0.1 = unique(top_Genus_pc$top_Genus_above_0.1)
mat_colors <- c(brewer.pal(8, "Dark2"), brewer.pal(8, "Set1"), brewer.pal(8, "Set2"))
# Create the abundance plot with a single stacked bar for 1%
p = ggplot(top_Genus_pc, aes(x = 1, y = top_Genus_percentage, fill = top_Genus_above_1)) +
geom_bar(stat = "identity") +
labs(x = "", y = "Abundance (%)", title = "Abundance Plot of Genus") +
theme_minimal() +
scale_fill_manual(values = genus_colors, name = "Top genera above 0.1%") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p
## 4.3 Statistical differences in relative abundance of top 20
genera
plot_relab_boxplot = function(genus = relative_df$g__Prevotella,
sample_data = sample_data,
title = "Prevotella",
...) {
rel_df = data.frame (
genus = genus,
condition = sample_data
)
# Create the boxplot with significance
p = ggplot(rel_df, aes(x = condition, y = genus, fill = condition)) +
geom_boxplot() +
geom_signif(comparisons = list(c("healthy", "diseased")),
map_signif_level = FALSE,
test = wilcox.test) +
labs(title = title,
x = "condition",
y = "Relative Abundance") +
theme_minimal() +
scale_fill_manual(values = c("healthy" = "#4682B4", "diseased" = "#E41A1C")) +
theme(legend.position = "none")
print(p)
}
Actinomyces_rel_bp = plot_relab_boxplot(genus = relative_df$g__Actinomyces,
sample_data = sample_data,
title = "Actinomyces")
Aggregatibacter_rel_bp = plot_relab_boxplot(genus = relative_df$g__Aggregatibacter,
sample_data = sample_data,
title = "Aggregatibacter")
Bacteroides_rel_bp = plot_relab_boxplot(genus = relative_df$g__Bacteroides,
sample_data = sample_data,
title = "Bacteroides")
Campylobacter_rel_bp = plot_relab_boxplot(genus = relative_df$g__Campylobacter,
sample_data = sample_data,
title = "Campylobacter")
Capnocytophaga_rel_bp = plot_relab_boxplot(genus = relative_df$g__Capnocytophaga,
sample_data = sample_data,
title = "Capnocytophaga")
Dialister_rel_bp = plot_relab_boxplot(genus = relative_df$g__Dialister,
sample_data = sample_data,
title = "Dialister")
Escherichia_rel_bp = plot_relab_boxplot(genus = relative_df$g__Escherichia,
sample_data = sample_data,
title = "Escherichia")
Filifactor_rel_bp = plot_relab_boxplot(genus = relative_df$g__Filifactor,
sample_data = sample_data,
title = "Filifactor")
Fusobacterium_rel_bp = plot_relab_boxplot(genus = relative_df$g__Fusobacterium,
sample_data = sample_data,
title = " Fusobacterium")
Gemella_rel_bp = plot_relab_boxplot(genus = relative_df$g__Gemella,
sample_data = sample_data,
title = "Gemella")
Haemophilus_rel_bp = plot_relab_boxplot(genus = relative_df$g__Haemophilus,
sample_data = sample_data,
title = "Haemophilus")
Leptotrichia_rel_bp = plot_relab_boxplot(genus = relative_df$g__Leptotrichia,
sample_data = sample_data,
title = "Leptotrichia")
Neisseria_rel_bp = plot_relab_boxplot(genus = relative_df$g__Neisseria,
sample_data = sample_data,
title = "Neisseria")
Porphyromonas_rel_bp = plot_relab_boxplot(genus = relative_df$g__Porphyromonas,
sample_data = sample_data,
title = "Porphyromonas")
Prevotella_rel_bp = plot_relab_boxplot(genus = relative_df$g__Prevotella,
sample_data = sample_data,
title = "Prevotella")
Rothia_rel_bp = plot_relab_boxplot(genus = relative_df$g__Rothia,
sample_data = sample_data,
title = "Rothia")
Schaalia_rel_bp = plot_relab_boxplot(genus = relative_df$g__Schaalia,
sample_data = sample_data,
title = "Schaalia")
Selenomonas_rel_bp = plot_relab_boxplot(genus = relative_df$g__Selenomonas,
sample_data = sample_data,
title = "Selenomonas")
Streptococcus_rel_bp = plot_relab_boxplot(genus = relative_df$g__Streptococcus,
sample_data = sample_data,
title = "Streptococcus")
Tannerella_rel_bp = plot_relab_boxplot(genus = relative_df$g__Tannerella,
sample_data = sample_data,
title = "Tannerella")
Treponema_rel_bp = plot_relab_boxplot(genus = relative_df$g__Treponema,
sample_data = sample_data,
title = "Treponema")
Veillonella_rel_bp = plot_relab_boxplot(genus = relative_df$g__Veillonella,
sample_data = sample_data,
title = "Veillonella")
grid.arrange(Actinomyces_rel_bp, Gemella_rel_bp, Rothia_rel_bp, Schaalia_rel_bp, Streptococcus_rel_bp, Veillonella_rel_bp,
Aggregatibacter_rel_bp,Bacteroides_rel_bp, Capnocytophaga_rel_bp, Dialister_rel_bp, Escherichia_rel_bp, Filifactor_rel_bp,
Neisseria_rel_bp, Porphyromonas_rel_bp, Prevotella_rel_bp, Selenomonas_rel_bp, Tannerella_rel_bp, Treponema_rel_bp,
ncol=6, top = "Relative Abundance")
Differential analysis used the DESeq2 model on normalised count data and determined fold-change and significant differences between noma and healthy samples at the genera level.
Use makePhyloseqFromTreeSE from Miaverse to convert
# Assume your TreeSummarizedExperiment object is called `tse`
# Get the current sample names (column names)
tse_metaphlan_genus = altExp(tse_metaphlan, "Genus")
sample_names_genus <- colnames(tse_metaphlan_genus)
sample_names_genus
## [1] "A10" "A11" "A13" "A14" "A16"
## [6] "A17" "A18" "A19" "A1" "A2"
## [11] "A3" "A4" "A5" "A6" "A7"
## [16] "A8" "A9" "DRR214959" "DRR214960" "DRR214961"
## [21] "DRR214962" "DRR241310" "SRR5892208" "SRR5892209" "SRR5892210"
## [26] "SRR5892211" "SRR5892212" "SRR5892213" "SRR5892214" "SRR5892215"
## [31] "SRR5892216" "SRR5892217" "SRS013942" "SRS014468" "SRS014692"
## [36] "SRS015055" "SRS019120"
# Create a vector indicating the condition (diseased or healthy)
# Assuming sample_1 to sample_19 are diseased, and sample_20 to sample_22 are healthy
condition <- c(rep("diseased", 17), rep("healthy", 20))
# Create a DataFrame with this information
sample_metadata_disease <- DataFrame(condition = condition)
rownames(sample_metadata_disease) = sample_names
sample_metadata_disease
## DataFrame with 37 rows and 1 column
## condition
## <character>
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## ... ...
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
head(as.data.frame(sample_metadata_disease), n = 37)
## condition
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## A17 diseased
## A18 diseased
## A19 diseased
## A1 diseased
## A2 diseased
## A3 diseased
## A4 diseased
## A5 diseased
## A6 diseased
## A7 diseased
## A8 diseased
## A9 diseased
## DRR214959 healthy
## DRR214960 healthy
## DRR214961 healthy
## DRR214962 healthy
## DRR241310 healthy
## SRR5892208 healthy
## SRR5892209 healthy
## SRR5892210 healthy
## SRR5892211 healthy
## SRR5892212 healthy
## SRR5892213 healthy
## SRR5892214 healthy
## SRR5892215 healthy
## SRR5892216 healthy
## SRR5892217 healthy
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
# Add this DataFrame as colData to your TreeSummarizedExperiment object
colData(tse_metaphlan_genus) <- sample_metadata_disease
# Check that colData was added successfully
colData(tse_metaphlan_genus)
## DataFrame with 37 rows and 1 column
## condition
## <character>
## A10 diseased
## A11 diseased
## A13 diseased
## A14 diseased
## A16 diseased
## ... ...
## SRS013942 healthy
## SRS014468 healthy
## SRS014692 healthy
## SRS015055 healthy
## SRS019120 healthy
# Use makePhyloseqFromTreeSE from Miaverse
phyloseq_metaphlan_genus = makePhyloseqFromTreeSE(tse_metaphlan_genus)
deseq2_metaphlan_genus = phyloseq_to_deseq2(phyloseq_metaphlan_genus, design = ~condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_genus = deseq2_metaphlan_genus
design(dds_genus) <- ~ condition # Replace with your column name for condition
# Run DESeq2 analysis
dds_genus <- DESeq(dds_genus)
# Extract results for diseased vs healthy
res_genus <- results(dds_genus, contrast = c("condition", "diseased", "healthy"))
string <- "NA_p__Actinobacteria_c__Actinobacteria_o__Streptomycetales_f__Streptomycetaceae_g__Streptomyces"
string_result <- gsub(".*(g__Streptomyces)", "\\1", string)
print(string_result)
## [1] "g__Streptomyces"
# Clean up genus names for dds
rownames(dds_genus) <- gsub(".*(g__*)", "\\1", rownames(dds_genus))
# Clean up genus names for res
res_genus@rownames <- gsub(".*(g__*)", "\\1", res_genus@rownames)
deseq_volcano_g = ggplot(as.data.frame(res_genus),
aes(x = log2FoldChange,
y = -log10(pvalue),
color = ifelse(-log10(pvalue) > 1.3, log2FoldChange > 0, "grey"),
label = ifelse(-log10(pvalue) > 1.3, as.character(rownames(as.data.frame(res_genus))), ''))) +
geom_point() +
geom_hline(yintercept = 1.3) +
scale_color_manual(name = "Log2 fold change",
values = c("TRUE" = "#E47273", "FALSE" = "#6DA6CE", "grey" = "grey")) +
theme_minimal() +
labs(title = "Differential Abundance Analysis",
x = "Log2 Fold Change",
y = "-Log10 P-Value") +
geom_label_repel()
deseq_volcano_g
## Warning: ggrepel: 723 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Sort summary list by p-value
res_ordered_genus <- res_genus[order(res_genus$padj),]
head(res_ordered_genus)
## log2 fold change (MLE): condition diseased vs healthy
## Wald test p-value: condition diseased vs healthy
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## g__Streptococcus 1.20291e+06 -4.55825 0.372011 -12.2530 1.61898e-34
## g__Wolinella 8.26901e+01 4.31475 0.366735 11.7653 5.89076e-32
## g__Atopobium 1.71802e+04 -5.01526 0.443219 -11.3155 1.09918e-29
## g__Mesorhizobium 1.10485e+03 3.43460 0.323326 10.6227 2.33594e-26
## g__Mogibacterium 1.99595e+04 -3.38475 0.319175 -10.6047 2.83487e-26
## g__Treponema 5.17980e+04 4.65427 0.445544 10.4463 1.52395e-25
## padj
## <numeric>
## g__Streptococcus 2.63084e-31
## g__Wolinella 4.78624e-29
## g__Atopobium 5.95388e-27
## g__Mesorhizobium 9.21334e-24
## g__Mogibacterium 9.21334e-24
## g__Treponema 4.12737e-23
head(res_ordered_genus, n =20)
## log2 fold change (MLE): condition diseased vs healthy
## Wald test p-value: condition diseased vs healthy
## DataFrame with 20 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## g__Streptococcus 1.20291e+06 -4.55825 0.372011 -12.2530 1.61898e-34
## g__Wolinella 8.26901e+01 4.31475 0.366735 11.7653 5.89076e-32
## g__Atopobium 1.71802e+04 -5.01526 0.443219 -11.3155 1.09918e-29
## g__Mesorhizobium 1.10485e+03 3.43460 0.323326 10.6227 2.33594e-26
## g__Mogibacterium 1.99595e+04 -3.38475 0.319175 -10.6047 2.83487e-26
## ... ... ... ... ... ...
## g__Trueperella 369.261 -3.88747 0.446248 -8.71147 2.99966e-18
## g__Brochothrix 104.056 -1.75310 0.202062 -8.67605 4.09762e-18
## g__Cellulomonas 276.053 -3.45878 0.402625 -8.59058 8.65346e-18
## g__Carnobacterium 965.031 -2.28987 0.267449 -8.56188 1.11039e-17
## g__Helicobacter 466.085 1.54649 0.182290 8.48371 2.18116e-17
## padj
## <numeric>
## g__Streptococcus 2.63084e-31
## g__Wolinella 4.78624e-29
## g__Atopobium 5.95388e-27
## g__Mesorhizobium 9.21334e-24
## g__Mogibacterium 9.21334e-24
## ... ...
## g__Trueperella 3.04653e-16
## g__Brochothrix 3.91684e-16
## g__Cellulomonas 7.81215e-16
## g__Carnobacterium 9.49675e-16
## g__Helicobacter 1.77219e-15
# Filter for significant species
significant_genus <- as.data.frame(res_genus) %>%
filter(padj < 0.05)
head(significant_genus)
## baseMean log2FoldChange lfcSE stat
## g__Coprothermobacter 5.335380 1.4410253 0.5116803 2.816261
## g__Caldisericum 17.227644 1.8273856 0.3686445 4.957040
## g__Endomicrobium 19.087437 1.1224406 0.3217218 3.488855
## g__Thermodesulfobacterium 36.973878 0.6180491 0.2182316 2.832078
## g__Thermovibrio 8.442273 1.2631596 0.4095794 3.084041
## g__Hydrogenobaculum 10.621548 1.5927663 0.5268100 3.023417
## pvalue padj
## g__Coprothermobacter 4.858615e-03 1.572759e-02
## g__Caldisericum 7.157526e-07 5.512312e-06
## g__Endomicrobium 4.850945e-04 2.096486e-03
## g__Thermodesulfobacterium 4.624651e-03 1.518194e-02
## g__Thermovibrio 2.042094e-03 7.357877e-03
## g__Hydrogenobaculum 2.499376e-03 8.772108e-03
# Filter for significant species with higher abundance in healthy samples
significant_genus_less_thn_zero <- as.data.frame(res_genus) %>%
filter(padj < 0.05, log2FoldChange < 0)
# Filter for significant species with higher abundance in diseased samples
significant_genus_grtr_thn_zero <- as.data.frame(res_genus) %>%
filter(padj < 0.05, log2FoldChange > 0)
# Print the results
head(significant_genus_grtr_thn_zero)
## baseMean log2FoldChange lfcSE stat
## g__Coprothermobacter 5.335380 1.4410253 0.5116803 2.816261
## g__Caldisericum 17.227644 1.8273856 0.3686445 4.957040
## g__Endomicrobium 19.087437 1.1224406 0.3217218 3.488855
## g__Thermodesulfobacterium 36.973878 0.6180491 0.2182316 2.832078
## g__Thermovibrio 8.442273 1.2631596 0.4095794 3.084041
## g__Hydrogenobaculum 10.621548 1.5927663 0.5268100 3.023417
## pvalue padj
## g__Coprothermobacter 4.858615e-03 1.572759e-02
## g__Caldisericum 7.157526e-07 5.512312e-06
## g__Endomicrobium 4.850945e-04 2.096486e-03
## g__Thermodesulfobacterium 4.624651e-03 1.518194e-02
## g__Thermovibrio 2.042094e-03 7.357877e-03
## g__Hydrogenobaculum 2.499376e-03 8.772108e-03
head(significant_genus_less_thn_zero)
## baseMean
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA 9950.98811
## g__Candidatus_Saccharimonas 132.50097
## g__Petrotoga 22.53548
## g__Luteitalea 20.57572
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA 16.78372
## g__Gemmata 23.15259
## log2FoldChange
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA -3.2200083
## g__Candidatus_Saccharimonas -2.7674007
## g__Petrotoga -0.6724982
## g__Luteitalea -0.9780208
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA -1.1875691
## g__Gemmata -1.3204942
## lfcSE
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA 0.4082855
## g__Candidatus_Saccharimonas 0.4198980
## g__Petrotoga 0.2627007
## g__Luteitalea 0.3365252
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA 0.4243233
## g__Gemmata 0.3807214
## stat
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA -7.886658
## g__Candidatus_Saccharimonas -6.590649
## g__Petrotoga -2.559940
## g__Luteitalea -2.906234
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA -2.798737
## g__Gemmata -3.468401
## pvalue
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA 3.103870e-15
## g__Candidatus_Saccharimonas 4.379075e-11
## g__Petrotoga 1.046901e-02
## g__Luteitalea 3.658079e-03
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA 5.130296e-03
## g__Gemmata 5.235659e-04
## padj
## NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA 1.441083e-13
## g__Candidatus_Saccharimonas 8.471424e-10
## g__Petrotoga 3.016337e-02
## g__Luteitalea 1.230720e-02
## NA_p__Planctomycetes_c__Planctomycetia_o__Planctomycetales_NA_NA 1.644250e-02
## g__Gemmata 2.250779e-03
For noma samples
# Order the results
sig_res_genus <- significant_genus[order(significant_genus$padj),]
head(sig_res_genus)
## baseMean log2FoldChange lfcSE stat pvalue
## g__Streptococcus 1.202906e+06 -4.558247 0.3720108 -12.25300 1.618980e-34
## g__Wolinella 8.269014e+01 4.314750 0.3667349 11.76531 5.890761e-32
## g__Atopobium 1.718017e+04 -5.015261 0.4432187 -11.31554 1.099178e-29
## g__Mesorhizobium 1.104854e+03 3.434604 0.3233255 10.62274 2.335943e-26
## g__Mogibacterium 1.995953e+04 -3.384746 0.3191753 -10.60466 2.834874e-26
## g__Treponema 5.179799e+04 4.654275 0.4455440 10.44627 1.523951e-25
## padj
## g__Streptococcus 2.630843e-31
## g__Wolinella 4.786244e-29
## g__Atopobium 5.953883e-27
## g__Mesorhizobium 9.213340e-24
## g__Mogibacterium 9.213340e-24
## g__Treponema 4.127368e-23
head(sig_res_genus, n= 15)
## baseMean
## g__Streptococcus 1.202906e+06
## g__Wolinella 8.269014e+01
## g__Atopobium 1.718017e+04
## g__Mesorhizobium 1.104854e+03
## g__Mogibacterium 1.995953e+04
## g__Treponema 5.179799e+04
## g__Staphylococcus 4.277333e+03
## g__Filifactor 1.174252e+04
## g__Cutibacterium 9.676835e+02
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 1.769642e+02
## g__Bradyrhizobium 8.111957e+02
## g__Lactococcus 2.259492e+03
## g__Kurthia 1.268817e+02
## g__Salinispira 6.477155e+01
## g__Kingella 1.023084e+03
## log2FoldChange
## g__Streptococcus -4.558247
## g__Wolinella 4.314750
## g__Atopobium -5.015261
## g__Mesorhizobium 3.434604
## g__Mogibacterium -3.384746
## g__Treponema 4.654275
## g__Staphylococcus -2.267672
## g__Filifactor 3.876533
## g__Cutibacterium -3.931289
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 3.508700
## g__Bradyrhizobium 2.395926
## g__Lactococcus -3.677923
## g__Kurthia -1.632900
## g__Salinispira 3.530439
## g__Kingella 4.225752
## lfcSE
## g__Streptococcus 0.3720108
## g__Wolinella 0.3667349
## g__Atopobium 0.4432187
## g__Mesorhizobium 0.3233255
## g__Mogibacterium 0.3191753
## g__Treponema 0.4455440
## g__Staphylococcus 0.2269665
## g__Filifactor 0.3909906
## g__Cutibacterium 0.4055293
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 0.3682544
## g__Bradyrhizobium 0.2517317
## g__Lactococcus 0.3882346
## g__Kurthia 0.1740667
## g__Salinispira 0.3804755
## g__Kingella 0.4697250
## stat
## g__Streptococcus -12.252998
## g__Wolinella 11.765313
## g__Atopobium -11.315544
## g__Mesorhizobium 10.622743
## g__Mogibacterium -10.604662
## g__Treponema 10.446274
## g__Staphylococcus -9.991219
## g__Filifactor 9.914646
## g__Cutibacterium -9.694218
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 9.527923
## g__Bradyrhizobium 9.517779
## g__Lactococcus -9.473456
## g__Kurthia -9.380891
## g__Salinispira 9.279020
## g__Kingella 8.996226
## pvalue
## g__Streptococcus 1.618980e-34
## g__Wolinella 5.890761e-32
## g__Atopobium 1.099178e-29
## g__Mesorhizobium 2.335943e-26
## g__Mogibacterium 2.834874e-26
## g__Treponema 1.523951e-25
## g__Staphylococcus 1.665213e-23
## g__Filifactor 3.595287e-23
## g__Cutibacterium 3.190735e-22
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 1.604608e-21
## g__Bradyrhizobium 1.769193e-21
## g__Lactococcus 2.707342e-21
## g__Kurthia 6.541730e-21
## g__Salinispira 1.710452e-20
## g__Kingella 2.336101e-19
## padj
## g__Streptococcus 2.630843e-31
## g__Wolinella 4.786244e-29
## g__Atopobium 5.953883e-27
## g__Mesorhizobium 9.213340e-24
## g__Mogibacterium 9.213340e-24
## g__Treponema 4.127368e-23
## g__Staphylococcus 3.865673e-21
## g__Filifactor 7.302927e-21
## g__Cutibacterium 5.761050e-20
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 2.607487e-19
## g__Bradyrhizobium 2.613581e-19
## g__Lactococcus 3.666192e-19
## g__Kurthia 8.177163e-19
## g__Salinispira 1.985347e-18
## g__Kingella 2.530776e-17
# Order the results by significance greater than zero (Noma)
sig_res_genus_grtr_thn_zero <- significant_genus_grtr_thn_zero[order(significant_genus_grtr_thn_zero$padj),]
sig_res_genus_grtr_thn_zero$genus = rownames(sig_res_genus_grtr_thn_zero)
head(as.data.frame(sig_res_genus_grtr_thn_zero, n = 30))
## baseMean
## g__Wolinella 82.69014
## g__Mesorhizobium 1104.85379
## g__Treponema 51797.99394
## g__Filifactor 11742.51502
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 176.96419
## g__Bradyrhizobium 811.19569
## log2FoldChange
## g__Wolinella 4.314750
## g__Mesorhizobium 3.434604
## g__Treponema 4.654275
## g__Filifactor 3.876533
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 3.508700
## g__Bradyrhizobium 2.395926
## lfcSE
## g__Wolinella 0.3667349
## g__Mesorhizobium 0.3233255
## g__Treponema 0.4455440
## g__Filifactor 0.3909906
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 0.3682544
## g__Bradyrhizobium 0.2517317
## stat
## g__Wolinella 11.765313
## g__Mesorhizobium 10.622743
## g__Treponema 10.446274
## g__Filifactor 9.914646
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 9.527923
## g__Bradyrhizobium 9.517779
## pvalue
## g__Wolinella 5.890761e-32
## g__Mesorhizobium 2.335943e-26
## g__Treponema 1.523951e-25
## g__Filifactor 3.595287e-23
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 1.604608e-21
## g__Bradyrhizobium 1.769193e-21
## padj
## g__Wolinella 4.786244e-29
## g__Mesorhizobium 9.213340e-24
## g__Treponema 4.127368e-23
## g__Filifactor 7.302927e-21
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 2.607487e-19
## g__Bradyrhizobium 2.613581e-19
## genus
## g__Wolinella g__Wolinella
## g__Mesorhizobium g__Mesorhizobium
## g__Treponema g__Treponema
## g__Filifactor g__Filifactor
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA
## g__Bradyrhizobium g__Bradyrhizobium
deseq_genus_sig_diff_noma = head(sig_res_genus_grtr_thn_zero, n = 30)
# Order the results by change greater than zero (Noma)
change_res_genus_grtr_thn_zero <- significant_genus_grtr_thn_zero[order(significant_genus_grtr_thn_zero$log2FoldChange),]
head(change_res_genus_grtr_thn_zero, n = 15)
## baseMean log2FoldChange lfcSE stat
## g__Polynucleobacter 158.58906 0.3636115 0.1356301 2.680906
## g__Geobacter 197.18389 0.4759009 0.1852261 2.569297
## g__Pandoraea 112.41762 0.4765583 0.2021663 2.357259
## g__Thermoanaerobacterium 199.00680 0.5044903 0.1998842 2.523913
## g__Blattabacterium 91.60746 0.5085865 0.1965401 2.587698
## g__Chlorobium 73.96961 0.5181272 0.1777769 2.914480
## g__Salegentibacter 82.57517 0.5339047 0.2164407 2.466748
## g__Thermosipho 57.03753 0.5379010 0.2015835 2.668379
## g__Arcobacter 528.83484 0.5452385 0.1749888 3.115849
## g__Echinicola 113.89700 0.5461497 0.1670004 3.270351
## g__Brachyspira 254.89486 0.5813467 0.2025750 2.869784
## g__Hungateiclostridium 251.56932 0.5858876 0.2157149 2.716027
## g__Natranaerobius 20.39054 0.5919283 0.2369416 2.498203
## g__Legionella 287.40179 0.6098754 0.1350509 4.515894
## g__Thermodesulfobacterium 36.97388 0.6180491 0.2182316 2.832078
## pvalue padj
## g__Polynucleobacter 7.342307e-03 2.234316e-02
## g__Geobacter 1.019050e-02 2.951793e-02
## g__Pandoraea 1.841040e-02 4.896383e-02
## g__Thermoanaerobacterium 1.160566e-02 3.285575e-02
## g__Blattabacterium 9.661964e-03 2.813744e-02
## g__Chlorobium 3.562811e-03 1.206160e-02
## g__Salegentibacter 1.363462e-02 3.761675e-02
## g__Thermosipho 7.621828e-03 2.308301e-02
## g__Arcobacter 1.834161e-03 6.697778e-03
## g__Echinicola 1.074143e-03 4.236780e-03
## g__Brachyspira 4.107519e-03 1.362187e-02
## g__Hungateiclostridium 6.607043e-03 2.041149e-02
## g__Natranaerobius 1.248245e-02 3.485221e-02
## g__Legionella 6.305023e-06 3.955854e-05
## g__Thermodesulfobacterium 4.624651e-03 1.518194e-02
For healthy samples
# Order the results by significance less than zero (healthy)
sig_res_genus_less_thn_zero <- significant_genus_less_thn_zero[order(significant_genus_less_thn_zero$padj),]
sig_res_genus_less_thn_zero$genus = rownames(sig_res_genus_less_thn_zero)
head(as.data.frame(sig_res_genus_less_thn_zero, n =30))
## baseMean log2FoldChange lfcSE stat pvalue
## g__Streptococcus 1202906.4900 -4.558247 0.3720108 -12.252998 1.618980e-34
## g__Atopobium 17180.1727 -5.015261 0.4432187 -11.315544 1.099178e-29
## g__Mogibacterium 19959.5262 -3.384746 0.3191753 -10.604662 2.834874e-26
## g__Staphylococcus 4277.3332 -2.267672 0.2269665 -9.991219 1.665213e-23
## g__Cutibacterium 967.6835 -3.931289 0.4055293 -9.694218 3.190735e-22
## g__Lactococcus 2259.4921 -3.677923 0.3882346 -9.473456 2.707342e-21
## padj genus
## g__Streptococcus 2.630843e-31 g__Streptococcus
## g__Atopobium 5.953883e-27 g__Atopobium
## g__Mogibacterium 9.213340e-24 g__Mogibacterium
## g__Staphylococcus 3.865673e-21 g__Staphylococcus
## g__Cutibacterium 5.761050e-20 g__Cutibacterium
## g__Lactococcus 3.666192e-19 g__Lactococcus
deseq_genus_sig_diff_healthy = head(sig_res_genus_less_thn_zero, n = 30)
# Order the results by change greater than zero (healthy)
change_res_genus_less_thn_zero <- significant_genus_less_thn_zero[order(significant_genus_less_thn_zero$log2FoldChange),]
head(as.data.frame(change_res_genus_less_thn_zero, n =15))
## baseMean log2FoldChange lfcSE stat
## g__Rothia 3.111581e+05 -5.606861 0.8099458 -6.922513
## g__Atopobium 1.718017e+04 -5.015261 0.4432187 -11.315544
## g__Schaalia 1.275042e+05 -4.845840 0.6891937 -7.031174
## g__Kochikohdavirus 4.688045e+00 -4.830583 0.7413594 -6.515845
## g__Streptococcus 1.202906e+06 -4.558247 0.3720108 -12.252998
## g__Ruania 6.387048e+01 -4.088696 0.5071776 -8.061665
## pvalue padj
## g__Rothia 4.437010e-12 9.613521e-11
## g__Atopobium 1.099178e-29 5.953883e-27
## g__Schaalia 2.048035e-12 4.894202e-11
## g__Kochikohdavirus 7.228141e-11 1.276710e-09
## g__Streptococcus 1.618980e-34 2.630843e-31
## g__Ruania 7.526211e-16 4.367890e-14
taxonomyRanks(tse_metaphlan)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
# make an assay for abundance
tse_metaphlan <- transformAssay(tse_metaphlan, assay.type="counts", method="relabundance")
# make an altExp and matrix for Genus
altExp(tse_metaphlan,"Genus") <- agglomerateByRank(tse_metaphlan,"Genus")
# make a dataframe of relative abundance
relabundance_df_Genus <- as.data.frame(assay(altExp(tse_metaphlan, "Genus"), "relabundance"))
# make a matric of relative abundance
relabundance_matrix_Genus <- assay(altExp(tse_metaphlan, "Genus"), "relabundance")
# calculate the total relative abundance of each Genus (row sums)
total_relabundance_Genus <- rowSums(relabundance_matrix_Genus)
# Get the top highly abundant genera based on relative abundance
top_Genus_numbers_basic = sort(total_relabundance_Genus, decreasing = TRUE)
# Make into dataframe
top_Genus_numbers_df = as.data.frame(top_Genus_numbers_basic)
# Make into tibble
top_Genus_numbers = as.tibble(top_Genus_numbers_df)
# Rename genera to remove any higher taxonomic names (a quirk of the metaphaln style)
rownames(top_Genus_numbers_df) <- gsub(".*(g__*)", "\\1", rownames(top_Genus_numbers_df))
# Get percenatge by dividing by totoal number of samples (21) and * by 100
top_Genus_pc = top_Genus_numbers %>%
mutate(top_Genus_percentage = (top_Genus_numbers_basic/21) * 100) %>%
mutate(top_Genus = rownames(top_Genus_numbers_df))
# Select only the top 20 genera by relative abundance
top_Genus_pc$top_20_Genus = ifelse(top_Genus_pc$top_Genus %in% top_Genus, top_Genus_pc$top_Genus, "-other")
# Select only the top genera with a relative abundance above 1%
top_Genus_pc$top_Genus_above_1 = ifelse(top_Genus_pc$top_Genus_percentage > 1, top_Genus_pc$top_Genus, "-other")
top_Genus_above_1 = unique(top_Genus_pc$top_Genus_above_1)
# Select
sig_res_genus_grtr_thn_zero$genera_above_1pc_relab = ifelse(sig_res_genus_grtr_thn_zero$genus %in% top_Genus_above_1, sig_res_genus_grtr_thn_zero$genus, "-other")
sig_res_genus_less_thn_zero$genera_above_1pc_relab = ifelse(sig_res_genus_less_thn_zero$genus %in% top_Genus_above_1, sig_res_genus_less_thn_zero$genus, "-other")
# Check which genera are both signifiantly different and highly abundant
unique(sig_res_genus_grtr_thn_zero$genera_above_1pc_relab)
## [1] "-other" "g__Treponema" "g__Porphyromonas" "g__Bacteroides"
## [5] "g__Selenomonas"
unique(sig_res_genus_less_thn_zero$genera_above_1pc_relab)
## [1] "g__Streptococcus" "-other" "g__Veillonella" "g__Gemella"
## [5] "g__Schaalia" "g__Rothia" "g__Actinomyces" "g__Haemophilus"
head(as.data.frame(sig_res_genus_grtr_thn_zero, n=10))
## baseMean
## g__Wolinella 82.69014
## g__Mesorhizobium 1104.85379
## g__Treponema 51797.99394
## g__Filifactor 11742.51502
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 176.96419
## g__Bradyrhizobium 811.19569
## log2FoldChange
## g__Wolinella 4.314750
## g__Mesorhizobium 3.434604
## g__Treponema 4.654275
## g__Filifactor 3.876533
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 3.508700
## g__Bradyrhizobium 2.395926
## lfcSE
## g__Wolinella 0.3667349
## g__Mesorhizobium 0.3233255
## g__Treponema 0.4455440
## g__Filifactor 0.3909906
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 0.3682544
## g__Bradyrhizobium 0.2517317
## stat
## g__Wolinella 11.765313
## g__Mesorhizobium 10.622743
## g__Treponema 10.446274
## g__Filifactor 9.914646
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 9.527923
## g__Bradyrhizobium 9.517779
## pvalue
## g__Wolinella 5.890761e-32
## g__Mesorhizobium 2.335943e-26
## g__Treponema 1.523951e-25
## g__Filifactor 3.595287e-23
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 1.604608e-21
## g__Bradyrhizobium 1.769193e-21
## padj
## g__Wolinella 4.786244e-29
## g__Mesorhizobium 9.213340e-24
## g__Treponema 4.127368e-23
## g__Filifactor 7.302927e-21
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA 2.607487e-19
## g__Bradyrhizobium 2.613581e-19
## genus
## g__Wolinella g__Wolinella
## g__Mesorhizobium g__Mesorhizobium
## g__Treponema g__Treponema
## g__Filifactor g__Filifactor
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA
## g__Bradyrhizobium g__Bradyrhizobium
## genera_above_1pc_relab
## g__Wolinella -other
## g__Mesorhizobium -other
## g__Treponema g__Treponema
## g__Filifactor -other
## NA_p__Chloroflexi_c__Anaerolineae_o__Anaerolineales_f__Anaerolineaceae_NA -other
## g__Bradyrhizobium -other
Plotting for noma
sig_res_genus_grtr_thn_zero_above_1pc_relab = sig_res_genus_grtr_thn_zero %>% filter(genera_above_1pc_relab != "-other")
sig_res_genus_grtr_thn_zero_above_1pc_relab = sig_res_genus_grtr_thn_zero_above_1pc_relab[!grepl("NA_p__Firmicutes_c__Clostridia_o__Clostridiales_f__Lachnospiraceae_NA", sig_res_genus_grtr_thn_zero_above_1pc_relab$genus), ]
padj_plot <- ggplot(sig_res_genus_grtr_thn_zero_above_1pc_relab, aes(x = reorder(genera_above_1pc_relab, `padj`), y = `padj`)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = round(`padj`, 4)),
hjust = 1.2, # Adjust horizontal position (slightly outside the bars)
vjust = 0.5, # Center vertically
size = 6, # Adjust text size
color = "black") +
coord_flip() + # Flip to make the plot horizontal
labs(
title = "Most significantly different genera above 1% abundance noma related",
x = "Genus",
y = "Adjusted p-value"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(hjust = 0.5)# Center the title
)
padj_plot
Plotting for healthy
# Plot for Healthy
sig_res_genus_less_thn_zero_above_1pc_relab = sig_res_genus_less_thn_zero %>% filter(genera_above_1pc_relab != "-other")
sig_res_genus_less_thn_zero_above_1pc_relab = sig_res_genus_less_thn_zero_above_1pc_relab[!grepl("NA_p__Firmicutes_c__Clostridia_o__Clostridiales_f__Clostridiales_Family_XIII._Incertae_Sedis_NA", sig_res_genus_less_thn_zero_above_1pc_relab$genus), ]
sig_res_genus_less_thn_zero_above_1pc_relab = sig_res_genus_less_thn_zero_above_1pc_relab[!grepl("NA_p__Candidatus_Saccharibacteria_NA_NA_NA_NA", sig_res_genus_less_thn_zero_above_1pc_relab$genus), ]
padj_plot <- ggplot(sig_res_genus_less_thn_zero_above_1pc_relab, aes(x = reorder(genera_above_1pc_relab, `padj`), y = `padj`)) +
geom_bar(stat = "identity", fill = "#D1352B") +
geom_text(aes(label = round(`padj`, 4)),
hjust = 1.2, # Adjust horizontal position (slightly outside the bars)
vjust = 0.5, # Center vertically
size = 6, # Adjust text size
color = "black") +
coord_flip() + # Flip to make the plot horizontal
labs(
title = "Most significantly different genera above 1% abundance noma related",
x = "Genus",
y = "Adjusted p-value"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(hjust = 0.5)# Center the title
)
padj_plot
First we create a function
plotCountsGGanysig <- function(dds, gene, intgroup = "condition", normalized = TRUE,
transform = TRUE, main, xlab = "group", returnData = FALSE,
replaced = FALSE, pc, plot = "point", text = TRUE, showSignificance = TRUE, ...) {
# Check input gene validity
stopifnot(length(gene) == 1 & (is.character(gene) | (is.numeric(gene) &
(gene >= 1 & gene <= nrow(dds)))))
# Check if all intgroup columns exist in colData
if (!all(intgroup %in% names(colData(dds))))
stop("all variables in 'intgroup' must be columns of colData")
# If not returning data, ensure intgroup variables are factors
if (!returnData) {
if (!all(sapply(intgroup, function(v) is(colData(dds)[[v]], "factor")))) {
stop("all variables in 'intgroup' should be factors, or choose returnData=TRUE and plot manually")
}
}
# Set pseudo count if not provided
if (missing(pc)) {
pc <- if (transform) 0.5 else 0
}
# Estimate size factors if missing
if (is.null(sizeFactors(dds)) & is.null(normalizationFactors(dds))) {
dds <- estimateSizeFactors(dds)
}
# Get the counts for the gene
cnts <- counts(dds, normalized = normalized, replaced = replaced)[gene, ]
# Generate grouping variable
group <- if (length(intgroup) == 1) {
colData(dds)[[intgroup]]
} else if (length(intgroup) == 2) {
lvls <- as.vector(t(outer(levels(colData(dds)[[intgroup[1]]]),
levels(colData(dds)[[intgroup[2]]]), function(x, y) paste(x, y, sep = ":"))))
droplevels(factor(apply(as.data.frame(colData(dds)[, intgroup, drop = FALSE]), 1, paste, collapse = ":"),
levels = lvls))
} else {
factor(apply(as.data.frame(colData(dds)[, intgroup, drop = FALSE]), 1, paste, collapse = ":"))
}
# Create the data frame with counts, group, and sample names
data <- data.frame(count = cnts + pc, group = group, sample = colnames(dds), condition = group)
# Set log scale if necessary
logxy <- if (transform) "y" else ""
# Set the plot title
if (missing(main)) {
main <- if (is.numeric(gene)) {
rownames(dds)[gene]
} else {
gene
}
}
# Set the y-axis label based on normalization
ylab <- ifelse(normalized, "normalized count", "count")
# Return the data if requested
if (returnData)
return(data.frame(count = data$count, colData(dds)[intgroup]))
# Create the base ggplot object with data and aesthetic mappings
p <- ggplot(data, aes(x = group, y = count, label = sample, color = condition)) +
labs(x = xlab, y = ylab, title = main) + # Labels and title
theme_minimal() + # Clean theme
scale_y_continuous(trans = ifelse(transform, "log10", "identity")) + # Apply log transformation if needed
scale_color_brewer(palette = "Set1") # Optional: use color brewer for nice color scheme
# Select the type of plot based on the 'plot' argument
if (plot == "point") {
p <- p + geom_point(size = 3)
if (text) p <- p + geom_text(hjust = -0.2, vjust = 0) # Add text if text = TRUE
} else if (plot == "jitter") {
p <- p + geom_jitter(size = 3, width = 0.2)
if (text) p <- p + geom_text(hjust = -0.2, vjust = 0) # Add text if text = TRUE
} else if (plot == "bar") {
p <- p + geom_bar(stat = "summary", fun = "mean", position = "dodge", width = 0.7) + # Bar plot with whiskers
geom_errorbar(stat = "summary", fun.data = "mean_se", width = 0.2)
} else if (plot == "violin") {
p <- p + geom_violin(trim = FALSE) + geom_jitter(size = 2, width = 0.2)
if (text) p <- p + geom_text(hjust = -0.2, vjust = 0) # Add text if text = TRUE
} else if (plot == "box") {
p <- p + geom_boxplot()
if (text) p <- p + geom_text(hjust = -0.2, vjust = 0) # Add text if text = TRUE
} else {
stop("Invalid plot type. Choose from 'point', 'jitter', 'bar', 'violin', or 'box'.")
}
# Add significance annotation if requested
if (showSignificance) {
# Get DESeq2 results for gene using Wald test and BH adjustment
res <- results(dds, contrast = c(intgroup, levels(group)[1], levels(group)[2]), alpha = 0.05)
res_gene <- res[gene, ]
# Check significance and add stars/annotations
if (!is.na(res_gene$padj) && res_gene$padj < 0.05) {
p <- p + annotate("text", x = 1.5, y = max(data$count), label = "*", size = 8)
}
}
print(p)
}
Next we use the function to plot any highly abundant significantly associated with Noma
# Significant highly abundant for Noma above 1%
plot_1 = plotCountsGGanysig(dds_genus, gene="g__Treponema", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_2 = plotCountsGGanysig(dds_genus, gene="g__Porphyromonas", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_3 = plotCountsGGanysig(dds_genus, gene="g__Bacteroides", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_4 = plotCountsGGanysig(dds_genus, gene="g__Selenomonas", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
grid.arrange(plot_1, plot_2, plot_3, plot_4, ncol=2)
Then we use the function to plot any highly abundant signifcantly
asscoiated with healthy global dataset
# Significant highly abundant for healthy above 1%
plot_1 = plotCountsGGanysig(dds_genus, gene="g__Streptococcus", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_2 = plotCountsGGanysig(dds_genus, gene="g__Veillonella", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_3 = plotCountsGGanysig(dds_genus, gene="g__Gemella", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_4 = plotCountsGGanysig(dds_genus, gene="g__Schaalia", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_5 = plotCountsGGanysig(dds_genus, gene="g__Rothia", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_6 = plotCountsGGanysig(dds_genus, gene="g__Actinomyces", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
plot_7 = plotCountsGGanysig(dds_genus, gene="g__Haemophilus", intgroup = "condition", plot = "box", text = FALSE, showSignificance = TRUE)
grid.arrange(plot_1, plot_2, plot_3, plot_4,
plot_5, plot_6, plot_7, ncol=2)
# Define the function to update metadata
update_sample_metadata <- function(tse_object) {
# Extract sample names (column names)
sample_names <- colnames(tse_object)
# Create the "accession" column based on sample name prefix
accession <- ifelse(grepl("^SRS", sample_names), "SRS",
ifelse(grepl("^SRR", sample_names), "SRR",
ifelse(grepl("^A", sample_names), "noma",
ifelse(grepl("^DRR", sample_names), "DRR", NA))))
# Create the "location" column based on sample name prefix
location <- ifelse(grepl("^SRS", sample_names), "USA",
ifelse(grepl("^SRR", sample_names), "Denmark",
ifelse(grepl("^A", sample_names), "Nigeria",
ifelse(grepl("^DRR", sample_names), "Japan", NA))))
# Create a DataFrame with the new metadata columns
sample_metadata <- DataFrame(accession = accession, location = location)
rownames(sample_metadata) <- sample_names
# Add the metadata as colData to the TreeSummarizedExperiment object
colData(tse_object) <- sample_metadata
# Return the updated object
return(tse_object)
}
# Example usage:
tse_metaphlan_genus_hc = altExp(tse_metaphlan, "Genus")
tse_metaphlan_genus_hc <- update_sample_metadata(tse_metaphlan_genus_hc)
head(as.data.frame(colData(tse_metaphlan_genus_hc)))
## accession location
## A10 noma Nigeria
## A11 noma Nigeria
## A13 noma Nigeria
## A14 noma Nigeria
## A16 noma Nigeria
## A17 noma Nigeria
# Add this DataFrame as colData to your TreeSummarizedExperiment object
colData(tse_metaphlan_genus_hc)
## DataFrame with 37 rows and 2 columns
## accession location
## <character> <character>
## A10 noma Nigeria
## A11 noma Nigeria
## A13 noma Nigeria
## A14 noma Nigeria
## A16 noma Nigeria
## ... ... ...
## SRS013942 SRS USA
## SRS014468 SRS USA
## SRS014692 SRS USA
## SRS015055 SRS USA
## SRS019120 SRS USA
unique(colData(tse_metaphlan_genus_hc))
## DataFrame with 4 rows and 2 columns
## accession location
## <character> <character>
## A10 noma Nigeria
## DRR214959 DRR Japan
## SRR5892208 SRR Denmark
## SRS013942 SRS USA
# Use makePhyloseqFromTreeSE from Miaverse
phyloseq_metaphlan_hc = makePhyloseqFromTreeSE(tse_metaphlan_genus_hc)
deseq2_metaphlan_hc = phyloseq::phyloseq_to_deseq2(phyloseq_metaphlan_hc, design = ~ accession)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_hc = deseq2_metaphlan_hc
design(dds_hc) <- ~ accession # Replace with your column name for condition
# Run DESeq2 analysis
dds_hc <- DESeq(dds_hc)
# Clean up genus names for dds
rownames(dds_hc) <- gsub(".*(g__*)", "\\1", rownames(dds_hc))
unique(as.data.frame(colData(tse_metaphlan_genus_hc)))
## accession location
## A10 noma Nigeria
## DRR214959 DRR Japan
## SRR5892208 SRR Denmark
## SRS013942 SRS USA
# Extract results for diseased vs healthy
res_SRR_noma <- results(dds_hc, contrast = c("accession", "noma", "SRR"))
res_DRR_noma <- results(dds_hc, contrast = c("accession", "noma", "DRR"))
res_SRS_noma <- results(dds_hc, contrast = c("accession", "noma", "SRS"))
Here we introduce another function which plots lines for significance
plotCountsGGsigline = function(dds, gene, intgroup = "condition", normalized = TRUE,
transform = TRUE, main, xlab = "group", returnData = FALSE,
replaced = FALSE, pc, plot = "point", text = TRUE,
showSignificance = TRUE, lineSpacing = 0.1,
groupOrder = NULL, ...) {
# input checks
stopifnot(length(gene)==1 && (is.character(gene) ||
(is.numeric(gene) && gene>=1 && gene<=nrow(dds))))
if (!all(intgroup %in% names(colData(dds))))
stop("all variables in 'intgroup' must be columns of colData")
if (!returnData) {
if (!all(sapply(intgroup, function(v) is(colData(dds)[[v]], "factor"))))
stop("all variables in 'intgroup' should be factors, or choose returnData=TRUE")
}
# pseudo-count
if (missing(pc)) pc <- if (transform) 0.5 else 0
# ensure size factors
if (is.null(sizeFactors(dds)) && is.null(normalizationFactors(dds)))
dds <- estimateSizeFactors(dds)
# extract counts + grouping
cnts <- counts(dds, normalized=normalized, replaced=replaced)[gene,]
if (length(intgroup)==1) {
group <- colData(dds)[[intgroup]]
} else {
lvls <- as.vector(t(outer(levels(colData(dds)[[intgroup[1]]]),
levels(colData(dds)[[intgroup[2]]]),
paste, sep=":")))
group <- droplevels(factor(
apply(as.data.frame(colData(dds)[,intgroup]),1,paste,collapse=":"),
levels=lvls))
}
# Set order of groups on the x-axis if specified
if (!is.null(groupOrder)) {
group <- factor(group, levels = groupOrder)
}
data <- data.frame(count=cnts+pc,
group=group,
sample=colnames(dds),
condition=group)
# axis labels
ylab <- ifelse(normalized, "normalized count", "count")
logxy <- ifelse(transform, "y", "")
if (missing(main)) {
main <- if (is.numeric(gene)) rownames(dds)[gene] else gene
}
if (returnData) {
return(data.frame(count=data$count, colData(dds)[intgroup]))
}
# base ggplot
p <- ggplot(data, aes(x=group,y=count,label=sample,
color=condition,group=group)) +
labs(x=xlab, y=ylab, title=main) +
theme_minimal() +
scale_y_continuous(trans=ifelse(transform,"log10","identity")) +
scale_color_brewer(palette="Set1")
# choose geom
if (plot=="point") {
p <- p + geom_point(size=3)
if (text) p <- p + geom_text(hjust=-0.2, vjust=0)
} else if (plot=="jitter") {
p <- p + geom_jitter(size=3,width=0.2)
if (text) p <- p + geom_text(hjust=-0.2, vjust=0)
} else if (plot=="bar") {
p <- p + geom_bar(stat="summary", fun="mean",
position="dodge", width=0.7) +
geom_errorbar(stat="summary", fun.data="mean_se", width=0.2)
} else if (plot=="violin") {
p <- p + geom_violin(trim=FALSE) +
geom_jitter(size=2,width=0.2)
if (text) p <- p + geom_text(hjust=-0.2, vjust=0)
} else if (plot=="box") {
p <- p + geom_boxplot() +
geom_jitter(size=2,width=0.2)
if (text) p <- p + geom_text(hjust=-0.2, vjust=0)
} else {
stop("Invalid plot type. Choose from 'point','jitter','bar','violin','box'.")
}
# add significance lines & stars
if (showSignificance) {
max_y <- max(data$count)
step <- max_y * lineSpacing
counter <- 0
for (i in 1:(nlevels(group) - 1)) {
for (j in (i + 1):nlevels(group)) {
contrast <- c(intgroup, levels(group)[i], levels(group)[j])
res <- results(dds, contrast = contrast, alpha = 0.05)
pv <- as.numeric(res[gene, "padj"])
# Determine label and color
if (is.na(pv) || pv > 0.05) {
lab <- "ns"
col <- "grey70"
size <- 3
} else if (pv <= 0.001) {
lab <- "***"
col <- "black"
size <- 6
} else if (pv <= 0.01) {
lab <- "**"
col <- "black"
size <- 6
} else {
lab <- "*"
col <- "black"
size <- 6
}
# Positioning
counter <- counter + 1
y_line <- max_y + step * counter
# Add line and label
p <- p +
annotate("segment",
x = i, xend = j,
y = y_line, yend = y_line,
colour = col) +
annotate("text",
x = mean(c(i, j)),
y = y_line + (step * 0.05),
label = lab,
size = size,
colour = col)
}
}
}
print(p)
}
Now plot the different genera
plot_1 = plotCountsGGsigline(dds_hc, gene= "g__Treponema", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_2 = plotCountsGGsigline(dds_hc, gene= "g__Bacteroides", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_3 = plotCountsGGsigline(dds_hc, gene= "g__Porphyromonas", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_4 = plotCountsGGsigline(dds_hc, gene= "g__Selenomonas", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_5 = plotCountsGGsigline(dds_hc, gene= "g__Prevotella", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_6 = plotCountsGGsigline(dds_hc, gene= "g__Streptococcus", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_7 = plotCountsGGsigline(dds_hc, gene= "g__Rothia", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_8 = plotCountsGGsigline(dds_hc, gene= "g__Veillonella", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_9 = plotCountsGGsigline(dds_hc, gene= "g__Gemella", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_10 = plotCountsGGsigline(dds_hc, gene= "g__Schaalia", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_11 = plotCountsGGsigline(dds_hc, gene= "g__Actinomyces", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
plot_12 = plotCountsGGsigline(dds_hc, gene= "g__Haemophilus", intgroup = "accession", plot = "box", text = FALSE, showSignificance = TRUE, lineSpacing = 5, groupOrder = c("noma", "DRR", "SRR", "SRS"))
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
grid.arrange(plot_1, plot_2, plot_3, plot_4, plot_5,
plot_6, plot_7, plot_8, plot_9, plot_10,
plot_11, plot_12, ncol=3)
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Random forest (RF) classifiers were applied based on 500 estimator trees to identify key predictors of microbial community composition and their association with noma or healthy states.
We also ran a cross validation of the dataset taking 20% of samples as test data and 80% as training data. This was used to assess the predictive power of the RF classifier, assessed as an area under the receiver operating characteristic (ROC) curve (AUC) between 0 and 1.
A permutation test (n = 1000) was conducted to evaluate the statistical significance of the observed classification accuracy, this allowed for a null distribution of accuracy to be plotted. The observed accuracy of the random forest model was overlayed on the null distribution.
# Extract metadata and OTU table
phyloseq_obj <- phyloseq_noma_esd
metadata <- data.frame(sample_data(phyloseq_obj))
otu_data <- as.data.frame(otu_table(phyloseq_obj))
otu_data <- t(otu_data) # Transpose so samples are rows and species are columns
# Combine OTU data with metadata
ml_data <- cbind(metadata, otu_data)
# Ensure the metadata column you're predicting (e.g., condition) is a factor
ml_data$condition <- as.factor(ml_data$condition) # Replace "condition" with your actual column
# Remove columns containing NA
ml_data_clean <- ml_data[, !is.na(colnames(ml_data))]
ml_data_clean <- ml_data[, !grepl("NA", colnames(ml_data))]
set.seed(123456)
rf_model_cv <- randomForest(condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
print(rf_model_cv)
##
## Call:
## randomForest(formula = condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 41
##
## OOB estimate of error rate: 2.7%
## Confusion matrix:
## diseased healthy class.error
## diseased 16 1 0.05882353
## healthy 0 20 0.00000000
importance_rf_model_cv <- importance(rf_model_cv)
# Delete everything from start to Genus
ml_data_col_names = sub(".*_g__","",colnames(ml_data))
# Add Genus back in
ml_data_clean = paste0(paste(rep("g__", length(ml_data_col_names)), ml_data_col_names))
# Delete the space introduced by this
ml_data_col_names = sub(" ","",ml_data_col_names)
ml_data_col_names = sub("g__condition","condition",ml_data_col_names)
head(ml_data_col_names)
## [1] "condition" "g__Coprothermobacter" "g__Caldisericum"
## [4] "g__Desulfurispirillum" "g__Endomicrobium" "g__Elusimicrobium"
ml_data_clean = ml_data
colnames(ml_data_clean) = ml_data_col_names
set.seed(123456)
# Run Random Forest to classify samples based on 'condition'
rf_model <- randomForest(condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 41
##
## OOB estimate of error rate: 2.7%
## Confusion matrix:
## diseased healthy class.error
## diseased 16 1 0.05882353
## healthy 0 20 0.00000000
importance_rf_model <- importance(rf_model)
set.seed(123456)
# Run Random Forest to classify samples based on 'condition'
rf_model_condition <- randomForest(condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
print(rf_model_condition)
##
## Call:
## randomForest(formula = condition ~ ., data = ml_data_clean, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 41
##
## OOB estimate of error rate: 2.7%
## Confusion matrix:
## diseased healthy class.error
## diseased 16 1 0.05882353
## healthy 0 20 0.00000000
importance_rf_model <- importance(rf_model_condition)
head(importance_rf_model)
## diseased healthy MeanDecreaseAccuracy MeanDecreaseGini
## g__Coprothermobacter 0.000000 0.000000 0.000000 0.00000000
## g__Caldisericum 0.000000 0.000000 0.000000 0.00000000
## g__Desulfurispirillum 0.000000 0.000000 0.000000 0.00000000
## g__Endomicrobium 0.000000 0.000000 0.000000 0.00000000
## g__Elusimicrobium 0.000000 0.000000 0.000000 0.00000000
## g__Dictyoglomus 1.001002 1.001002 1.001002 0.03294723
# Plot Variable Importance
# To plot the variable importance, use the %IncMSE (or MeanDecreaseAccuracy)
# from the importance output. Below is an example of how to create a bar plot
# of the top 10 most important predictors:
# Extract variable importance and convert to a data frame
importance_df <- as.data.frame(importance_rf_model)
importance_df <- tibble::rownames_to_column(importance_df, "genus")
importance_df <- importance_df %>%
dplyr::arrange(desc(`MeanDecreaseAccuracy`)) %>%
dplyr::slice_head(n = 50) # Select top 10 most important predictors
# select by any above 1.7
importance_df <- importance_df %>%
filter(`MeanDecreaseAccuracy`>1.7) %>%
arrange(desc(`MeanDecreaseAccuracy`))
importance_df_ggplot = importance_df %>% filter(!grepl("NA", genus))
# Plot the top 10 important predictors
gp = ggplot(importance_df_ggplot, aes(x = reorder(genus, `MeanDecreaseAccuracy`), y = `MeanDecreaseAccuracy`)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip to make the plot horizontal
labs(
title = "Top Most Important Genus Predictors in Random Forest",
x = "Genus",
y = "Increase in Mean Squared Error (%IncMSE)"
) +
theme_minimal(base_size = 10)
gp
# Select only the top genera with a relative abundance above 1%
top_Genus_pc$top_Genus_above_0.1 = ifelse(top_Genus_pc$top_Genus_percentage > 1, top_Genus_pc$top_Genus, "-other")
top_Genus_above_1 = unique(top_Genus_pc$top_Genus_above_1)
top_Genus_above_1_nog = gsub("g__", "", top_Genus_above_1)
top_Genus_above_1_nog
## [1] "Prevotella" "Streptococcus" "Neisseria" "Haemophilus"
## [5] "Veillonella" "Rothia" "Fusobacterium" "Treponema"
## [9] "Schaalia" "Escherichia" "Aggregatibacter" "Selenomonas"
## [13] "Leptotrichia" "Actinomyces" "Campylobacter" "Capnocytophaga"
## [17] "Porphyromonas" "Bacteroides" "Gemella" "-other"
# Select genera above 1%
importance_df$genera_above_1pc = ifelse(importance_df$genus %in% top_Genus_above_1, importance_df$genus, "-other")
unique(importance_df$genera_above_1pc)
## [1] "-other" "g__Treponema" "g__Streptococcus" "g__Rothia"
## [5] "g__Schaalia" "g__Bacteroides" "g__Veillonella"
importance_df
## genus diseased healthy MeanDecreaseAccuracy MeanDecreaseGini
## 1 g__Bradyrhizobium 3.660392 3.557975 3.673806 0.50996139
## 2 g__Nitratifractor 2.886977 2.997524 3.030241 0.33400705
## 3 g__Treponema 2.962498 2.911517 2.991059 0.36043243
## 4 g__Wolinella 2.801180 2.802402 2.981173 0.28526589
## 5 g__Streptococcus 2.787027 2.779934 2.915069 0.28964736
## 6 g__Mesorhizobium 2.790511 2.761828 2.881425 0.29020077
## 7 g__Rothia 2.599118 2.761250 2.823918 0.28268074
## 8 g__Simiduia 2.549200 2.864130 2.805329 0.28998448
## 9 g__Halothiobacillus 2.496354 2.689688 2.755405 0.26437303
## 10 g__Thiomicrospira 2.601722 2.658815 2.726651 0.24352427
## 11 g__Salinispira 2.521035 2.350143 2.625674 0.23040216
## 12 g__Methylomonas 2.579653 2.470100 2.543027 0.24927539
## 13 g__Helicobacter 2.088690 2.232992 2.531597 0.18590805
## 14 g__Schaalia 2.144130 2.413290 2.495430 0.19663482
## 15 g__Gluconobacter 2.227086 2.517391 2.437368 0.23894843
## 16 g__Spirochaeta 1.722526 2.028057 2.437185 0.19821883
## 17 g__Apibacter 2.037999 2.284614 2.397257 0.17730516
## 18 g__Prosthecochloris 2.216696 2.469697 2.394048 0.25998720
## 19 g__Spiribacter 2.297540 2.117724 2.272871 0.18288168
## 20 g__Serpentinomonas 1.936345 1.962885 2.260316 0.17640499
## 21 g__Geoalkalibacter 1.841104 2.264631 2.251752 0.14044622
## 22 g__Sphaerochaeta 2.206124 2.103478 2.196250 0.17029541
## 23 g__Streptobacillus 2.119667 2.160702 2.176204 0.15830508
## 24 g__Agarilytica 2.370907 1.749595 2.169279 0.16377581
## 25 g__Thalassospira 1.877006 2.045678 2.136811 0.13782816
## 26 g__Granulibacter 1.906361 2.119751 2.130254 0.14356757
## 27 g__Borreliella 1.930511 2.138987 2.088895 0.16342194
## 28 g__Chloroherpeton 2.142411 1.954972 2.082952 0.10610174
## 29 g__Niabella 2.141765 1.951497 2.039544 0.17989189
## 30 g__Bacteroides 1.999736 1.857074 2.012205 0.17357101
## 31 g__Leptospirillum 1.349602 1.697325 1.997624 0.13054504
## 32 g__Aquabacterium 1.919154 2.001198 1.989838 0.09464830
## 33 g__Nitrosococcus 1.732312 2.112943 1.977565 0.17081648
## 34 g__Nitrosomonas 1.936378 1.925083 1.976378 0.12559560
## 35 g__Salinimonas 2.003268 1.768322 1.967160 0.13122178
## 36 g__Microvirga 1.878808 1.859586 1.960088 0.14039013
## 37 g__Bdellovibrio 1.939976 1.864312 1.954875 0.13583473
## 38 g__Labilithrix 1.895867 1.650699 1.936699 0.11105688
## 39 g__Ruminiclostridium 1.795386 1.904024 1.935427 0.13083564
## 40 g__Kosakonia 1.879982 1.956399 1.930921 0.11751968
## 41 g__Hydrogenophaga 1.727020 1.937971 1.870173 0.10838049
## 42 g__Muricauda 1.936378 1.775070 1.868046 0.11144237
## 43 g__Sulfurospirillum 1.637280 1.902655 1.852548 0.11252056
## 44 g__Aromatoleum 2.036208 1.229737 1.813548 0.15655335
## 45 g__Hydromonas 1.411208 1.869259 1.774088 0.06547688
## 46 g__Methylovulum 1.687840 1.707430 1.730118 0.10702703
## 47 g__Veillonella 1.570559 1.659584 1.723172 0.08020104
## 48 g__Atopobium 1.417051 1.599778 1.722366 0.03950574
## 49 g__Jonquetella 1.723004 1.417051 1.703829 0.13368684
## genera_above_1pc
## 1 -other
## 2 -other
## 3 g__Treponema
## 4 -other
## 5 g__Streptococcus
## 6 -other
## 7 g__Rothia
## 8 -other
## 9 -other
## 10 -other
## 11 -other
## 12 -other
## 13 -other
## 14 g__Schaalia
## 15 -other
## 16 -other
## 17 -other
## 18 -other
## 19 -other
## 20 -other
## 21 -other
## 22 -other
## 23 -other
## 24 -other
## 25 -other
## 26 -other
## 27 -other
## 28 -other
## 29 -other
## 30 g__Bacteroides
## 31 -other
## 32 -other
## 33 -other
## 34 -other
## 35 -other
## 36 -other
## 37 -other
## 38 -other
## 39 -other
## 40 -other
## 41 -other
## 42 -other
## 43 -other
## 44 -other
## 45 -other
## 46 -other
## 47 g__Veillonella
## 48 -other
## 49 -other
importance_df_to_plot = importance_df %>% filter(genera_above_1pc != "-other") %>% filter(!grepl("NA", genus))
# Plot the top 10 important predictors above 0.1%
gp2 = ggplot(importance_df_to_plot, aes(x = reorder(genera_above_1pc, `MeanDecreaseAccuracy`), y = `MeanDecreaseAccuracy`)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip to make the plot horizontal
labs(
title = "Top Most Important Genera Predictors",
x = "Genera above 1% overall abundance",
y = "Increase in Mean Squared Error (%IncMSE)"
) +
theme_minimal(base_size = 10)
gp2
otu_table_g = as.data.frame(otu_table(phyloseq_obj))
colnames(otu_table_g) <- gsub(".*(g__*)", "\\1", colnames(otu_table_g))
colnames(otu_table_g)
## [1] "A10" "A11" "A13" "A14" "A16"
## [6] "A17" "A18" "A19" "A1" "A2"
## [11] "A3" "A4" "A5" "A6" "A7"
## [16] "A8" "A9" "DRR214959" "DRR214960" "DRR214961"
## [21] "DRR214962" "DRR241310" "SRR5892208" "SRR5892209" "SRR5892210"
## [26] "SRR5892211" "SRR5892212" "SRR5892213" "SRR5892214" "SRR5892215"
## [31] "SRR5892216" "SRR5892217" "SRS013942" "SRS014468" "SRS014692"
## [36] "SRS015055" "SRS019120"
# Remove columns containing NA
otu_table_g <- otu_table_g[, !is.na(colnames(otu_table_g))]
otu_table_g <- otu_table_g[, !grepl("NA", colnames(otu_table_g))]
colSums(is.na(otu_table_g))
## A10 A11 A13 A14 A16 A17 A18
## 0 0 0 0 0 0 0
## A19 A1 A2 A3 A4 A5 A6
## 0 0 0 0 0 0 0
## A7 A8 A9 DRR214959 DRR214960 DRR214961 DRR214962
## 0 0 0 0 0 0 0
## DRR241310 SRR5892208 SRR5892209 SRR5892210 SRR5892211 SRR5892212 SRR5892213
## 0 0 0 0 0 0 0
## SRR5892214 SRR5892215 SRR5892216 SRR5892217 SRS013942 SRS014468 SRS014692
## 0 0 0 0 0 0 0
## SRS015055 SRS019120
## 0 0
#otu_table_g = t(otu_table_g)
condition = as.data.frame(metadata$condition)
rownames(condition) = rownames(metadata)
otu_table_g = t(otu_table_g)
otu_table_g <- cbind(condition, otu_table_g)
# Split data into training and test sets (80/20 split)
set.seed(123456)
train_index <- sample(1:nrow(otu_table_g), 0.8 * nrow(otu_table_g))
train_data <- otu_table_g[train_index, ]
test_data <- otu_table_g[-train_index, ]
dim(train_data) # Check dimensions of your dataset
## [1] 29 1758
dim(test_data)
## [1] 8 1758
train_data$`metadata$condition` <- as.factor(train_data$`metadata$condition`)
# Train Random Forest
rf_model_2 <- randomForest(`metadata$condition` ~ ., data = train_data, importance = TRUE)
# Predict on test set
pred <- predict(rf_model_2, test_data, type = "prob")[, 2]
# Evaluate model performance
roc_obj <- roc(test_data$`metadata$condition`, pred)
## Setting levels: control = diseased, case = healthy
## Setting direction: controls < cases
roc_plot = plot(roc_obj, main = "ROC Curve for Random Forest")
roc_plot
##
## Call:
## roc.default(response = test_data$`metadata$condition`, predictor = pred)
##
## Data: pred in 5 controls (test_data$`metadata$condition` diseased) < 3 cases (test_data$`metadata$condition` healthy).
## Area under the curve: 1
auc(roc_obj)
## Area under the curve: 1
# Load required libraries
# Split data into training and testing sets
# This is a crucial step you were missing
set.seed(123456) # For reproducibility
train_index <- createDataPartition(ml_data_clean$condition, p = 0.8, list = FALSE)
train_data <- ml_data_clean[train_index, ]
test_data <- ml_data_clean[-train_index, ]
# Train Random Forest model on training data
rf_model_cv <- randomForest(condition ~ .,
data = train_data,
ntree = 500,
importance = TRUE)
# Print model summary
print(rf_model_cv)
##
## Call:
## randomForest(formula = condition ~ ., data = train_data, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 41
##
## OOB estimate of error rate: 3.33%
## Confusion matrix:
## diseased healthy class.error
## diseased 13 1 0.07142857
## healthy 0 16 0.00000000
# Variable importance
importance_rf_model_cv <- importance(rf_model_cv)
varImpPlot(rf_model_cv, main = "Variable Importance")
# Evaluate on testing data - never evaluate on the same data you trained on
predictions_prob <- predict(rf_model_cv, test_data, type = "prob")
# Check structure of predictions
str(predictions_prob)
## 'matrix' num [1:7, 1:2] 0.892 0.826 0.658 0.026 0.066 0.05 0.162 0.108 0.174 0.342 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:7] "A11" "A3" "A7" "DRR214962" ...
## ..$ : chr [1:2] "diseased" "healthy"
# Extract probabilities for the positive class (assuming binary classification)
# If condition has levels "No" and "Yes" or "0" and "1"
# Get the column name of the positive class (typically the second column)
positive_class <- colnames(predictions_prob)[2]
prob_positive <- predictions_prob[, positive_class]
# Create class predictions based on probability threshold
threshold <- 0.5
predictions_class <- factor(ifelse(prob_positive > threshold,
levels(test_data$condition)[2],
levels(test_data$condition)[1]),
levels = levels(test_data$condition))
# Create confusion matrix
conf_matrix <- confusionMatrix(predictions_class, test_data$condition)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction diseased healthy
## diseased 3 0
## healthy 0 4
##
## Accuracy : 1
## 95% CI : (0.5904, 1)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.01989
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4286
## Detection Rate : 0.4286
## Detection Prevalence : 0.4286
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : diseased
##
# ROC curve using ROCR
# For ROCR, we need numeric labels (convert factor to numeric)
numeric_labels <- as.numeric(test_data$condition) - 1 # Convert to 0/1
roc_pred <- prediction(predictions = prob_positive, labels = numeric_labels)
roc_perf <- performance(roc_pred, "tpr", "fpr")
# Plot ROC curve
plot(roc_perf,
colorize = TRUE,
print.cutoffs.at = seq(0, 1, 0.1),
text.adj = c(-0.2, 1.7),
main = "ROC Curve")
abline(a = 0, b = 1, lty = 2, col = "gray")
# Calculate AUC
auc_ROCR <- performance(roc_pred, measure = "auc")
auc_value <- auc_ROCR@y.values[[1]]
print(paste("AUC:", round(auc_value, 4)))
## [1] "AUC: 1"
# Alternative ROC curve using pROC package
roc_curve <- roc(response = test_data$condition,
predictor = prob_positive)
## Setting levels: control = diseased, case = healthy
## Setting direction: controls < cases
plot(roc_curve, main = "ROC Curve using pROC")
print(paste("AUC (pROC):", round(auc(roc_curve), 4)))
## [1] "AUC (pROC): 1"
# Print performance metrics
accuracy <- conf_matrix$overall["Accuracy"]
sensitivity <- conf_matrix$byClass["Sensitivity"]
specificity <- conf_matrix$byClass["Specificity"]
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 1"
print(paste("Sensitivity:", round(sensitivity, 4)))
## [1] "Sensitivity: 1"
print(paste("Specificity:", round(specificity, 4)))
## [1] "Specificity: 1"
# Ensure column names are correctly referenced
set.seed(123456)
train_data$`metadata$condition`
## NULL
colnames(train_data)[colnames(train_data) == "metadata$condition"] <- "condition"
colnames(test_data)[colnames(test_data) == "metadata$condition"] <- "condition"
train_data$condition
## [1] diseased diseased diseased diseased diseased diseased diseased diseased
## [9] diseased diseased diseased diseased diseased diseased healthy healthy
## [17] healthy healthy healthy healthy healthy healthy healthy healthy
## [25] healthy healthy healthy healthy healthy healthy
## Levels: diseased healthy
test_data$condition
## [1] diseased diseased diseased healthy healthy healthy healthy
## Levels: diseased healthy
# Permutation testing
null_accuracies <- replicate(10, {
# Shuffle the labels
perm_labels <- sample(train_data$condition) # Use the actual column name for condition
train_data_perm <- train_data
train_data_perm$condition <- perm_labels # Replace condition labels with permuted ones
# Train random forest on permuted data
rf_perm <- randomForest(condition ~ ., data = train_data_perm, importance = TRUE)
# Test on test_data
mean(predict(rf_perm, test_data) == test_data$condition) # Ensure test_data$condition matches
})
# Compare observed accuracy to null distribution
observed_accuracy <- mean(predict(rf_model_cv, test_data) == test_data$condition)
# Visualize and interpret
# 2. Create the plot
hist(null_accuracies, main = "Null Distribution of Accuracy", xlab = "Accuracy")
abline(v = observed_accuracy, col = "red", lwd = 2)
Here we run ordination of multivariate data into two dimensions using the unsupervised learning techniques; Principal Coordinate Analysis (PCoA), Non-metric Multidimensional Scaling (NMDS), and Redundancy Analysis (RDA) all with Bray-Curtis dissimilarity.
phyloseq_obj
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1757 taxa and 37 samples ]
## sample_data() Sample Data: [ 37 samples by 1 sample variables ]
## tax_table() Taxonomy Table: [ 1757 taxa by 7 taxonomic ranks ]
phyloseq_obj@sam_data[["condition"]] = as.factor(phyloseq_obj@sam_data[["condition"]])
# Calculate Bray-Curtis dissimilarity
dist_bc <- phyloseq::distance(phyloseq_obj, method = "bray")
dist_bc <- phyloseq::distance(phyloseq_obj, method = "bray")
adonis2(dist_bc ~ condition, data = as(sample_data(phyloseq_obj), "data.frame"))
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_bc ~ condition, data = as(sample_data(phyloseq_obj), "data.frame"))
## Df SumOfSqs R2 F Pr(>F)
## Model 1 2.5163 0.40556 23.879 0.001 ***
## Residual 35 3.6881 0.59444
## Total 36 6.2044 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Visualize PCoA
ordination_results <- ordinate(phyloseq_obj, method = "PCoA", distance = "bray")
# Convert ordination to a data frame
ordination_df <- as.data.frame(ordination_results$vectors)
rownames(ordination_df) = rownames(ordination_results$vectors)
ordination_df$condition <- sample_data(phyloseq_obj)$condition
head(ordination_df)
## Axis.1 Axis.2 Axis.3 Axis.4 Axis.5 Axis.6
## A10 -0.3990528 -0.2763281 -0.02763561 0.01750740 -0.138888422 0.0500316957
## A11 -0.2163923 0.1093911 0.08424231 -0.01368495 0.077659141 0.1039769612
## A13 -0.3788201 -0.0812252 0.02994839 0.04811056 -0.024370993 0.0032111610
## A14 -0.2237416 0.1484954 0.07359393 -0.02273669 0.066676250 0.0005697727
## A16 -0.1909393 0.3327742 -0.02746487 0.04747409 0.005856121 0.1327296103
## A17 -0.2253981 0.1666075 -0.11150852 0.18120400 0.321480041 -0.1972564933
## Axis.7 Axis.8 Axis.9 Axis.10 Axis.11 Axis.12
## A10 -0.014754793 -0.06335845 0.105871797 0.019766348 0.03963333 -0.045911689
## A11 0.074412305 0.09558781 -0.043101902 -0.065767576 0.02368048 0.009730679
## A13 0.062739465 -0.07794522 0.006988258 0.052129777 0.01741799 -0.026402558
## A14 0.012886279 0.10306105 0.071850753 0.038935393 -0.03672565 -0.094311398
## A16 -0.044221595 0.02505884 -0.053434452 -0.039630828 0.10089014 0.041121914
## A17 -0.005963391 -0.07225894 0.074203753 -0.002761451 0.01035562 0.015650505
## Axis.13 Axis.14 Axis.15 Axis.16 Axis.17
## A10 -0.007735055 0.0243225804 -0.0500978722 0.002754255 0.0141803813
## A11 0.028719219 0.0468579695 -0.0008639627 0.013852955 0.0001041077
## A13 0.090486148 -0.0029947033 -0.0005909820 -0.037640174 0.0089074175
## A14 -0.021976023 -0.0189616196 -0.0040817690 -0.017059139 0.0036328221
## A16 -0.036876441 0.0002420351 0.0005302102 -0.001988632 0.0121336219
## A17 -0.003764348 0.0045354662 -0.0201016169 0.016929575 -0.0135230435
## Axis.18 Axis.19 Axis.20 Axis.21 Axis.22 condition
## A10 -0.0204580789 -0.003451114 0.020110360 0.015963667 -0.008437933 diseased
## A11 -0.0039070040 -0.022333723 -0.001090578 -0.001483207 -0.002401530 diseased
## A13 -0.0053896699 -0.013526270 -0.003312557 -0.001061440 -0.002060376 diseased
## A14 0.0251298650 0.012571149 -0.001016701 0.001364480 0.007975996 diseased
## A16 -0.0009390817 -0.001842882 0.004423927 0.005939336 -0.004796995 diseased
## A17 0.0006735671 -0.001796595 0.001398673 -0.001108894 -0.001930982 diseased
# Plot using ggplot2
PCoA_plot = ggplot(ordination_df, aes(x = Axis.1, y = Axis.2, color = condition)) +
geom_point(size = 4) +
theme_minimal() +
labs(x = "PCoA1", y = "PCoA2") +
ggtitle("PCoA") +
theme_minimal(base_size = 10)
PCoA_plot
ordination_results <- ordinate(phyloseq_obj, method = "NMDS", distance = "bray")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.102349
## Run 1 stress 0.1218075
## Run 2 stress 0.1219462
## Run 3 stress 0.102349
## ... New best solution
## ... Procrustes: rmse 0.0001243376 max resid 0.0006357427
## ... Similar to previous best
## Run 4 stress 0.1310319
## Run 5 stress 0.102349
## ... New best solution
## ... Procrustes: rmse 9.059102e-05 max resid 0.0004721109
## ... Similar to previous best
## Run 6 stress 0.1412518
## Run 7 stress 0.1219459
## Run 8 stress 0.102349
## ... New best solution
## ... Procrustes: rmse 5.234615e-05 max resid 0.0002581765
## ... Similar to previous best
## Run 9 stress 0.1604445
## Run 10 stress 0.102349
## ... Procrustes: rmse 5.683581e-05 max resid 0.0002887745
## ... Similar to previous best
## Run 11 stress 0.1507048
## Run 12 stress 0.1197668
## Run 13 stress 0.1307239
## Run 14 stress 0.102349
## ... New best solution
## ... Procrustes: rmse 2.545909e-05 max resid 0.0001010183
## ... Similar to previous best
## Run 15 stress 0.1197667
## Run 16 stress 0.1473149
## Run 17 stress 0.1219461
## Run 18 stress 0.1519854
## Run 19 stress 0.1561846
## Run 20 stress 0.1561631
## *** Best solution repeated 1 times
# Convert ordination to a data frame
ordination_df <- as.data.frame(ordination_results$points)
rownames(ordination_df) = rownames(ordination_results$points)
ordination_df$condition <- sample_data(phyloseq_obj)$condition
head(ordination_df)
## MDS1 MDS2 condition
## A10 -0.06261851 0.105930840 diseased
## A11 -0.07493600 0.005848067 diseased
## A13 -0.05958729 0.074261637 diseased
## A14 -0.12131611 0.035256716 diseased
## A16 -0.11960044 -0.006426233 diseased
## A17 -0.25741911 0.112212205 diseased
# Plot using ggplot2
NMDS_plot = ggplot(ordination_df, aes(x = MDS1, y = MDS2, color = condition)) +
geom_point(size = 4) +
labs(x = "MDS1", y = "MDS2") +
ggtitle("NMDS") +
theme_minimal(base_size = 10)
NMDS_plot
# Visualize PCoA
ordination_results <- ordinate(phyloseq_obj, method = "RDA", distance = "bray")
# Convert ordination to a data frame
ordination_df <- as.data.frame(ordination_results[["CA"]][["u"]])
rownames(ordination_df) = rownames(ordination_results[["CA"]][["u"]])
ordination_df$condition <- sample_data(phyloseq_obj)$condition
ordination_df$sample_names = rownames(ordination_df)
head(ordination_df)
## PC1 PC2 PC3 PC4 PC5 PC6
## A10 -0.33204284 0.20912012 -0.144253041 -0.03059961 0.23588105 -0.03713143
## A11 -0.08468054 -0.05827521 0.091839883 -0.03904873 -0.03025444 0.11074921
## A13 -0.21986915 0.05864669 -0.005890016 -0.06419824 0.11837347 0.01683602
## A14 -0.10014296 -0.07028627 0.129444325 -0.03692765 -0.06995082 -0.05230855
## A16 -0.03148699 -0.19503334 0.263923709 -0.14196187 0.07884507 0.16475294
## A17 -0.04975940 -0.15219201 0.110139728 -0.25735315 -0.72081181 -0.17642925
## PC7 PC8 PC9 PC10 PC11 PC12
## A10 -0.02944898 0.25566690 -0.306760531 0.124883398 -0.12498715 0.16274571
## A11 0.01094340 -0.15876290 0.115473979 -0.367725699 -0.16103047 0.05711542
## A13 0.02455014 0.04861554 -0.109757220 0.007177984 -0.18117784 -0.42471427
## A14 -0.10766862 0.28908187 -0.164363766 0.068477770 0.10221473 0.21414285
## A16 -0.03183595 -0.11431324 0.176763963 -0.080347004 -0.16250451 0.27998110
## A17 -0.11097290 0.11671967 -0.002509756 0.347931019 -0.08731619 -0.19967489
## PC13 PC14 PC15 PC16 PC17 PC18
## A10 -0.095156533 0.09453161 -0.12202847 0.020785156 0.03601123 -0.1181814
## A11 -0.007034822 0.15933808 -0.07668232 0.112010102 -0.12134268 -0.1053740
## A13 -0.576576384 0.18839905 0.02470430 0.086624203 0.05976665 -0.1154655
## A14 0.165565336 -0.03733827 0.18188986 -0.148754274 0.18201159 -0.1773570
## A16 -0.059880870 0.26900679 -0.28375716 -0.031229954 -0.11600837 -0.2700773
## A17 -0.045742598 0.03633465 -0.01428297 0.009103755 -0.13884746 0.1092161
## PC19 PC20 PC21 PC22 PC23 PC24
## A10 -0.22109978 0.13269048 0.16100239 0.097298537 0.06213204 0.03879865
## A11 -0.01730906 0.46492550 -0.16470574 0.271570983 0.33759611 0.12054865
## A13 0.10220793 -0.14417527 -0.06732180 -0.122297369 0.16478915 -0.27003072
## A14 -0.12276290 -0.13931812 0.08742432 -0.030674244 -0.01195781 -0.02473966
## A16 -0.11544395 -0.03174451 0.02565170 -0.006926064 -0.51149227 -0.18306242
## A17 -0.06962461 0.14244117 -0.10618803 0.080501463 -0.07313958 -0.07749316
## PC25 PC26 PC27 PC28 PC29 PC30
## A10 -0.42812319 -0.21903386 -0.0007227003 -0.02340901 -0.02632542 -0.16877654
## A11 -0.02948713 -0.02394383 0.0017440496 0.21644163 -0.18301587 0.31180164
## A13 0.12137868 0.17231373 0.1538940397 -0.05300262 0.06280725 0.08988459
## A14 0.10997873 -0.17703706 0.3085810106 0.31629916 0.11977402 0.43975276
## A16 0.08708884 -0.08039283 -0.0044842244 -0.21725513 0.01552273 -0.10433124
## A17 -0.12309220 -0.05488621 -0.0601495289 0.03133817 -0.02899176 -0.06770742
## PC31 PC32 PC33 PC34 PC35 PC36
## A10 -0.03695492 -0.22395802 -0.04865635 -0.181440709 -0.037170808 -0.131063826
## A11 -0.01621193 -0.05244742 0.05173978 0.173070007 0.064738719 -0.042649428
## A13 -0.07342503 0.16770515 0.01757990 0.050672094 0.018517629 0.105086390
## A14 0.02943466 0.31496951 -0.06218461 0.074158512 0.013174469 -0.024876276
## A16 0.02869031 0.12802347 -0.09138193 0.042254488 -0.004793286 -0.020223887
## A17 -0.01490583 -0.06973480 -0.03553475 -0.007599638 0.002019312 0.005315909
## condition sample_names
## A10 diseased A10
## A11 diseased A11
## A13 diseased A13
## A14 diseased A14
## A16 diseased A16
## A17 diseased A17
# Plot using ggplot2
RDA_plot = ggplot(ordination_df, aes(x = PC1, y = PC2, color = condition)) +
geom_point(size = 4) +
labs(x = "PC1", y = "PC2") +
ggtitle("RDA") +
theme_minimal(base_size = 10)
RDA_plot
Here we plotted a heatmap displaying the z-score standardized centered log-ratio (CLR) of the top 20 most abundant genera.
# Ensure Genus
tse_genus <- altExp(tse_metaphlan, "Genus")
# Add clr-transformation (centred-lofg ratio)
tse_genus <- transformAssay(tse_genus, assay.type = "counts", method = "relabundance")
tse_genus <- transformAssay(tse_genus, assay.type = "relabundance",
method = "clr", pseudocount = TRUE)
## A pseudocount of 7.6187051401118e-09 was applied.
# Apply a z-transformation across rows (taxa)
tse_genus <- transformAssay(tse_genus, assay.type = "clr",
MARGIN = "features",
method = "standardize", name = "clr_z")
# Get 20 most abundant genera, and subsets the data by them
top_20_genus <- names(sort(total_relabundance_Genus, decreasing = TRUE)[1:20])
tse_genus_subset <- tse_genus[top_20_genus, ]
# Calculate the proportion of bacteria accounted for in the top 20 Genera
(sum(assay(tse_genus_subset, "relabundance"))) / 15
## [1] 2.226121
# delete to _g__
rownames(tse_genus_subset) <- gsub(".*(g__*)", "\\1", rownames(tse_genus_subset))
# Gets the assay table
mat <- assay(tse_genus_subset, "clr_z")
# make a dataframe for sample information
sample_data <- as.data.frame(colData(tse_genus_subset))
sample_data <- sample_data[, c("condition"), drop = FALSE]
# make a dataframe for taxonomic information
taxonomic_data <- as.data.frame(rowData(tse_genus_subset))
taxonomic_data <- taxonomic_data[, "Phylum", drop = FALSE]
# set heatmap scaling and colors
breaks <- seq(-ceiling(max(abs(mat))), ceiling(max(abs(mat))),
length.out = ifelse( max(abs(mat))>5, 2*ceiling(max(abs(mat))), 10 ) )
colors <- colorRampPalette(c("darkblue", "blue", "white", "red", "darkred"))(length(breaks)-1)
colors <- colorRampPalette(c("#1E3A5F", "#4682B4", "#FFFFFF", "#E41A1C", "#A11618"))(length(breaks) - 1)
# set column annotation colors
ann_colors <- list(
condition = c("healthy" = "#4682B4", "diseased" = "#E41A1C"),
taxonomic_data = c(brewer.pal(6, "Dark2"))
)
# create the heatmap
genus_clusters = pheatmap(mat,
annotation_col = sample_data,
annotation_row = taxonomic_data,
annotation_colors = ann_colors,
color = colors,
breaks = breaks,
clustering_distance_cols = "euclidean",
clustering_method = "complete")
# print plot
genus_clusters
# Combine CLR z-scores with Sample Conditions
# Combine mat and sample_data
clr_z_with_condition <- as.data.frame(t(mat))
clr_z_with_condition$condition <- sample_data$condition
# Specify genera of interest
genera_of_interest <- top_Genus_above_1
# Identify genera present in the CLR z-score data
valid_genera <- intersect(genera_of_interest, colnames(clr_z_with_condition))
valid_genera = c(valid_genera, "g__Filifactor")
# Subset for valid genera and condition
clr_z_subset <- clr_z_with_condition[, c(valid_genera, "condition")]
# Calculate mean z-scores
mean_clr_z_scores <- clr_z_subset %>%
dplyr::group_by(condition) %>%
dplyr::summarise(across(everything(), ~ mean(.x, na.rm = TRUE), .names = "mean_{.col}"))
# Calculate means
mean_clr_z_scores <- clr_z_subset %>%
dplyr::group_by(condition) %>%
dplyr::summarise(across(all_of(valid_genera), ~ mean(.x, na.rm = TRUE)))
# View the result
print(mean_clr_z_scores)
## # A tibble: 2 × 21
## condition g__Prevotella g__Streptococcus g__Neisseria g__Haemophilus
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 diseased 0.129 -0.940 0.196 -0.323
## 2 healthy -0.110 0.799 -0.166 0.275
## # ℹ 16 more variables: g__Veillonella <dbl>, g__Rothia <dbl>,
## # g__Fusobacterium <dbl>, g__Treponema <dbl>, g__Schaalia <dbl>,
## # g__Escherichia <dbl>, g__Aggregatibacter <dbl>, g__Selenomonas <dbl>,
## # g__Leptotrichia <dbl>, g__Actinomyces <dbl>, g__Campylobacter <dbl>,
## # g__Capnocytophaga <dbl>, g__Porphyromonas <dbl>, g__Bacteroides <dbl>,
## # g__Gemella <dbl>, g__Filifactor <dbl>
# Convert to long format for ggplot
clr_z_long <- mean_clr_z_scores %>%
pivot_longer(cols = -condition, names_to = "Genus", values_to = "Mean_Z_Score")
Mean_CLR_Z_Score = ggplot(clr_z_long, aes(x = Mean_Z_Score, y = condition, fill = condition)) +
geom_bar(stat = "identity", position = position_dodge(), color = "black") + # Black border around bars
facet_wrap(~ Genus, scales = "free_y", ncol = 1) + # Separate panels for each genus
labs(x = "Condition", y = "Mean CLR Z-Score", fill = "Condition", title = "Mean CLR Z-Scores by Genus and Condition") +
theme_minimal(base_size = 5) +
theme(panel.border = element_rect(color = "black", fill = NA, size = 1)) # Add a border around each panel
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Mean_CLR_Z_Score